code_block,too_long,marks,graph_vertex_id "# My forecasting COVID-19 confirmed cases and fatalities between March 19 and April 30 # My submission scored 0.52281 import numpy as np import pandas as pd from sklearn.model_selection import train_test_split from sklearn.preprocessing import LabelEncoder from sklearn.model_selection import cross_val_score from sklearn.metrics import mean_absolute_error from sklearn.metrics import mean_squared_error # model from catboost import Pool from catboost import CatBoostRegressor from xgboost import XGBRegressor from sklearn.ensemble import GradientBoostingRegressor from sklearn.ensemble import BaggingRegressor #plot pd.plotting.register_matplotlib_converters() import matplotlib.pyplot as plt %matplotlib inline import seaborn as sns import os for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename))",No,2,45.0 "# load training and testing data subm = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-2/submission.csv') train_data = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-2/train.csv', index_col='Id', parse_dates=True) test_data = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-2/test.csv', index_col='ForecastId', parse_dates=True)",No,5,45.0 subm,No,5,41.0 "# see testing data test_data",No,5,41.0 "# ...and training data train_data",No,5,41.0 train_data.describe(),No,5,40.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load in import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) # Input data files are available in the ""../input/"" directory. 
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory import os for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename)) # Any results you write to the current directory are saved as output.",No,3,22.0 train_data.describe(include=['O']),No,5,40.0 test_data.describe(),No,5,40.0 test_data.describe(include=['O']),No,5,40.0 train_data.shape,No,5,58.0 test_data.shape,No,5,58.0 "# detect missing values in training train_data.isna().sum()",No,5,39.0 "# ...in testing data test_data.isna().sum()",No,5,39.0 "#metric def RMSLE(pred,actual): return np.sqrt(np.mean(np.power((np.log(pred+1)-np.log(actual+1)),2)))",No,4,28.0 "#Convert data in integer train_data[Date]= pd.to_datetime(train_data[Date]).dt.strftime(""%m%d"").astype(int) test_data[Date]= pd.to_datetime(test_data[Date]).dt.strftime(""%m%d"").astype(int)",No,5,16.0 "# separate the vector correct answers ('ConfirmedCases' and 'Fatalities') from the training data train_data.dropna(axis=0, subset=['ConfirmedCases', 'Fatalities'], inplace=True) y_conf = train_data.ConfirmedCases train_data.drop(['ConfirmedCases'], axis=1, inplace=True) y_fatal = train_data.Fatalities train_data.drop(['Fatalities'], axis=1, inplace=True)",No,4,17.0 "# Select categorical columns in training and testing data categorical_cols = [cname for cname in train_data.columns if train_data[cname].dtype == ""object""]",No,5,77.0 "# replace missing values in training and testing data # as we saw above, the data are absent only in 'Province/State' train_data.fillna('-', inplace=True) test_data.fillna('-',inplace=True)",No,5,17.0 "# perform LabelEncoder with categorical data (categorical_cols) encodering = LabelEncoder() encod_train_data = train_data.copy() encod_test_data = test_data.copy() for col in categorical_cols: encod_train_data[col] = encodering.fit_transform(train_data[col]) encod_test_data[col] = encodering.fit_transform(test_data[col])",No,4,7.0 "# split encod_train_data into training(X_train) and validation(X_valid) data # and split vector correct answers ('ConfirmedCases') X_train, X_valid, y_train, y_valid = train_test_split(encod_train_data, y_conf, train_size=0.8, test_size=0.2, random_state=0)",No,5,13.0 "# determine the best metrics for the model def get_score(n_estimators): model = GradientBoostingRegressor(n_estimators=n_estimators) scores = cross_val_score(model, X_train, y_train, cv=5) return scores.mean()",No,5,84.0 "def rmse_score(n_estimators): rmse = np.sqrt(-cross_val_score(GradientBoostingRegressor(n_estimators=n_estimators), X_train, y_train, scoring=""neg_mean_squared_error"", cv = 5)) return(rmse)",No,5,84.0 "# select model and install parameters model = CatBoostRegressor(iterations=4000, depth=9, learning_rate=0.5, loss_function='RMSE')",No,5,4.0 "# train the model model.fit(X_train,y_train)",No,5,7.0 "# preprocessing of validation data, get predictions preds = model.predict(X_valid) print('MAE:', mean_absolute_error(y_valid, preds))",No,4,27.0 "# make the prediction using the resulting model preds = model.predict(X_valid) print('MSE:', mean_squared_error(y_valid, preds))",No,3,48.0 "x_list = [X_train, X_valid] y_list = [y_train, y_valid] scoring = list(map(lambda x,y: round(model.score(x,y)*100, 2), x_list, y_list)) scoring",No,4,49.0 "# get predictions test data final_preds_conf = model.predict(encod_test_data)",No,5,48.0 "# split encod_train_data into training(X_train) and validation(X_valid) data # and split vector 
correct answers ('Fatalities') X_train_f, X_valid_f, y_train_f, y_valid_f = train_test_split(encod_train_data, y_fatal, train_size=0.8, test_size=0.2, random_state=0)",No,5,13.0 "# train the model model.fit(X_train_f,y_train_f)",No,5,7.0 "# preprocessing of validation data, get predictions preds = model.predict(X_valid_f) print('MAE:', mean_absolute_error(y_valid_f, preds))",No,4,27.0 "# make the prediction using the resulting model preds = model.predict(X_valid_f) print('MSE:', mean_squared_error(y_valid_f, preds))",No,3,48.0 "x_list_f = [X_train_f, X_valid_f] y_list_f = [y_train_f, y_valid_f] scoring = list(map(lambda x,y: round(model.score(x,y)*100, 2), x_list_f, y_list_f)) scoring",No,4,49.0 "# get predictions test data final_preds_fatal = model.predict(encod_test_data)",No,5,48.0 "# and save test predictions to file output.to_csv('submission.csv', index=False) print('Complete!')",No,5,25.0 output.tail(30),No,5,41.0 output.describe(),No,5,40.0 !pip install mxnet autogluon,No,5,87.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load in import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) import seaborn as sns import matplotlib.pyplot as plt # Input data files are available in the ""../input/"" directory. # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory import os for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename)) # Any results you write to the current directory are saved as output.",No,5,88.0 "import pandas as pd import numpy as np import shutil, os from autogluon import TabularPrediction as task directory = '../input/covid19-global-forecasting-week-2/' label_cases = 'ConfirmedCases' # name of target variable to predict in this competition label_fatalities = 'Fatalities' outputdir_cases = 'AGmodels_' + label_cases + '/' # where to store trained models outputdir_fatalities = 'AGmodels_' + label_fatalities + '/' # where to store trained models if os.path.exists(outputdir_cases): shutil.rmtree(outputdir_cases) if os.path.exists(outputdir_fatalities): shutil.rmtree(outputdir_fatalities) train_data = task.Dataset(file_path=directory+'train.csv') train_data.drop([""Id""], axis=1, inplace=True) log_cases_vals = np.log(train_data[label_cases] + 1) log_fatalities_vals = np.log(train_data[label_fatalities] + 1) train_data[label_fatalities] = log_fatalities_vals train_data[label_cases] = log_cases_vals train_data_cases = train_data.drop([label_fatalities], axis=1) train_data_fatalities = train_data.drop([label_cases], axis=1) train_data.head()'",No,2,45.0 "df_train = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-2/train.csv') df_test = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-2/test.csv') df_submit = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-2/submission.csv')",No,5,45.0 df_train.head(),No,5,41.0 df_train.info(),No,5,40.0 "print(""Number of Country_Region: "", df_train['Country_Region'].nunique()) print(""Dates are ranging from day"", min(df_train['Date']), ""to day"", max(df_train['Date']), "", a total of"", df_train['Date'].nunique(), ""days"") print(""The countries that have Province/Region given are : "", 
df_train[df_train['Province_State'].isna()==False]['Country_Region'].unique())'",No,5,54.0 df_train.columns,No,5,71.0 df_train['Province_State'].unique(),No,5,57.0 "plt.figure(figsize=(40,40)) temp_df= df_train[df_train['ConfirmedCases']>5000] sns.barplot(y = temp_df['Country_Region'] , x = temp_df['ConfirmedCases']>10000) sns.set_context('paper') plt.ylabel(""Country_Region"",fontsize=30) plt.xlabel(""Counts"",fontsize=30) plt.title(""Counts of Countries affected by the pandemic that have confirmed cases > 5000"",fontsize=30) plt.xticks(rotation = 90)'",No,4,53.0 "confirmed_total_dates = df_train.groupby(['Date']).agg({'ConfirmedCases':['sum']}) fatalities_total_dates = df_train.groupby(['Date']).agg({'Fatalities':['sum']}) total_dates = confirmed_total_dates.join(fatalities_total_dates) fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(17,7)) total_dates.plot(ax=ax1) ax1.set_title(""Global confirmed cases"", size=13) ax1.set_ylabel(""Total Number of cases"", size=13) ax1.set_xlabel(""Date"", size=13) fatalities_total_dates.plot(ax=ax2, color='orange') ax2.set_title(""Global deceased cases"", size=13) ax2.set_ylabel(""Total Number of cases"", size=13) ax2.set_xlabel(""Date"", size=13)'",No,4,40.0 "italy = df_train[df_train['Country_Region'] == 'Italy'] plt.figure(figsize=(20,10)) sns.lineplot(x = 'Date' , y = 'ConfirmedCases' , data = italy) plt.xticks(rotation = 90,size=12) plt.xlabel('Date',size=15) plt.ylabel('Confirmed Cases',size=15) plt.title('Confirmed Cases per date in Italy',size=20) plt.show()",No,5,75.0 "italy = df_train[df_train['Country_Region'] == 'Italy'] plt.figure(figsize=(20,10)) sns.lineplot(x = 'Date' , y = 'Fatalities' , data = italy,color='orange') plt.xticks(rotation = 90,size=12) plt.xlabel('Date',size=15) plt.ylabel('Fatalities',size=15) plt.title('Fatalities in Italy per Date',size=20) plt.show()",No,5,75.0 "usa = df_train[df_train['Country_Region'] == 'US'] plt.figure(figsize=(20,10)) sns.lineplot(x = 'Date' , y = 'ConfirmedCases' , data = usa,color='g') plt.xticks(rotation = 90,size=13) plt.xlabel('Date',size=15) plt.ylabel('Confirmed Cases',size=15) plt.title('Confirmed Cases in US per Date',size=20) plt.show()",No,5,75.0 "plt.figure(figsize=(20,10)) sns.lineplot(x = 'Date' , y = 'Fatalities' , data = usa,color='purple') plt.title('Fatalities in US per Date',size=20) plt.xticks(rotation = 90,size=13) plt.xlabel('Date',size=15) plt.ylabel('Fatalities',size=15) plt.show()",No,5,75.0 "plt.figure(figsize=(20,10)) sns.barplot(x='Province_State',y='ConfirmedCases',data=usa,ci=None) plt.xticks(rotation = 90,size=13) plt.xlabel('Province_State',size=15) plt.ylabel('Confirmed Cases',size=15) plt.title('Confirmed Cases in US Province_State ',size=20) plt.show()",No,5,33.0 "#we now do the analysis of NYC as per week. 
import warnings warnings.filterwarnings('ignore') temp_df = usa[usa['Province_State'] == 'New York'] temp_df['Date'] = pd.to_datetime(temp_df['Date']) temp_df.insert(6,'Week',temp_df['Date'].dt.week) f,axes = plt.subplots(1,2,figsize=(12,5)) sns.lineplot(x = 'Week',y = 'ConfirmedCases',color='r',data=temp_df,ax = axes[0]) sns.lineplot(x = 'Week',y = 'Fatalities',color='b',data=temp_df,ax = axes[1]) axes[0].title.set_text('Confirmed Cases in NYC per week') axes[1].title.set_text('Fatalities in NYC per week')",No,4,14.0 "china = df_train[df_train['Country_Region'] == 'China'] plt.figure(figsize=(20,10)) sns.lineplot(x = 'Date' , y = 'ConfirmedCases' , data = china,color='aqua') plt.xticks(rotation = 90,size=12) plt.xlabel('Date',size=15) plt.ylabel('Confirmed Cases',size=15) sns.set_context('paper') plt.title('Confirmed Cases in China per Date',size=20) plt.show()",No,5,75.0 "china = df_train[df_train['Country_Region'] == 'China'] plt.figure(figsize=(20,10)) sns.lineplot(x = 'Date' , y = 'Fatalities' , data = china,color='grey') plt.xticks(rotation = 90,size=12) plt.xlabel('Date',size=15) plt.ylabel('Fatalities',size=15) sns.set_context('paper') plt.title('Fatalities in China per Date',size=20) plt.show()",No,5,75.0 "plt.figure(figsize=(20,10)) sns.barplot(x='Province_State',y='ConfirmedCases',data=china) plt.xticks(rotation = 90,size=13) plt.title('Confirmed Cases in China Province_State',size=20) plt.ylabel('Confirmed Cases',size=15) plt.xlabel('Province_State',size=15) plt.show()",No,5,33.0 "#we now do the analysis of Hubei as per week. import warnings warnings.filterwarnings('ignore') china_t = china[china['Province_State'] == 'Hubei'] china_t['Date'] = pd.to_datetime(china_t['Date']) china_t.insert(6,'Week',china_t['Date'].dt.week) f,axes = plt.subplots(1,2,figsize=(12,5)) sns.lineplot(x = 'Week',y = 'ConfirmedCases',color='r',data=china_t,ax = axes[0]) sns.lineplot(x = 'Week',y = 'Fatalities',color='b',data=china_t,ax = axes[1]) axes[0].title.set_text('Confirmed Cases in Hubei per week') axes[1].title.set_text('Fatalities in Hubei per week')",No,4,14.0 "df_train = df_train[['Date','Province_State','Country_Region','ConfirmedCases','Fatalities']] df_train.head()",No,3,10.0 "print(""Read in libraries"") import numpy as np import pandas as pd import matplotlib.pyplot as plt from scipy.optimize import curve_fit from statsmodels.tsa.statespace.sarimax import SARIMAX from statsmodels.tsa.arima_model import ARIMA from random import random",No,5,22.0 "import pandas as pd import numpy as np # Very big number to be used for a parameter values of some models BIG_NUMBER = 1000000",No,5,77.0 "train_df = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-2/train.csv', na_filter=False) test_df = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-2/test.csv', na_filter=False) submission_df = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-2/submission.csv')",No,5,45.0 train_df,No,5,41.0 test_df,No,5,41.0 submission_df,No,5,41.0 "train_df = train_df[[""Province_State"", ""Country_Region"", ""Date"", ""ConfirmedCases"", ""Fatalities""]] train_df",No,5,10.0 "len(set(train_df[""Country_Region""])) == len(set(test_df[""Country_Region""]))",No,3,37.0 "set(train_df[""Country_Region""]) == set(test_df[""Country_Region""])",No,3,37.0 "countries = set(train_df[""Country_Region""]) countries",No,5,77.0 "# This function assumes that the number of confirmed cases/fatalities doubles every n days # The task is to find the optimal n from curve fitting, separately for cases and fatalities 
def func(x, b, n): return x * (b ** (1/n))",No,5,53.0 "from scipy.optimize import curve_fit # Iterate through countries sorted alphabetically from A to Z. # As some countries, like USA, China, Canada, UK, Australia, have provinces/states, decend to the province level # (i.e., each province within such countries gets its owm model) for country in sorted(countries): print(""Country: "", country) # Select information related to the current country c_df = train_df[train_df[""Country_Region""] == country] # Get a list of country's provinces provinces = set(c_df[""Province_State""]) print(""Provinces: "", provinces) # Iterate over provinces for province in sorted(provinces): # Create a compound name for each country when provinces are present if province != """": full_country = country + ""-"" + province else: full_country = country # From country information, select the current province information p_df = c_df[c_df[""Province_State""] == province] # Prepare data for building a model X1 = p_df[p_df[""ConfirmedCases""] > 0][""ConfirmedCases""].values[:-1] # Omit the last value in order to properly form labels y1 = p_df[p_df[""ConfirmedCases""] > 0][""ConfirmedCases""].values[1:] # Notice that ""labels"" are in fact ""data"" shifted one position to the right X2 = p_df[p_df[""Fatalities""] > 0][""Fatalities""].values[:-1] # Omit the last value in order to properly form labels y2 = p_df[p_df[""Fatalities""] > 0][""Fatalities""].values[1:] # Notice that ""labels"" are in fact ""data"" shifted one position to the right # For confirmed cases, find the optimal value of a model parameter and perform the curve fitting if possible # Treat special cases when either X or y or both contains all zeroes or just one (last) non-zero value! if len(X1) > 1 and len(y1) > 1: # Build a model only if there are two or more non-zero values popt, _ = curve_fit(func, X1, y1) popt_cases = popt # there is just one parameter else: # otherwise, just set the parameter to a very big number, implying that there would be almost no change in numbers popt_cases = 2, BIG_NUMBER # Treat the special case if it turned out that the parameter value is zero if popt_cases[1] == 0: # Set the parameter to a very large value m so that the quantity 2**(1/m) -> 1, which implies that # the numbers won't grow popt_cases = 2, BIG_NUMBER print(""{}: Optimal parameter value for confirmed cases: {}"".format(full_country, popt_cases)) # For fatalities, find the optimal value of a model parameter and perform the curve fitting if possible # Treat special cases when either X or y or both contains all zeroes or just one (last) non-zero value! 
if len(X2) > 1 and len(y2) > 1: popt, _ = curve_fit(func, X2, y2) popt_fatalities = popt # there is just one parameter else: # otherwise, just set the parameter to a very big number, implying that there would be almost no change in numbers popt_fatalities = 2, BIG_NUMBER # Treat the special case if it turned out that the parameter value is zero if popt_fatalities[1] == 0: # Set the parameter to a very large value m so that the quantity 2**(1/m) -> 1, which implies that # the numbers won't grow popt_fatalities = 2, BIG_NUMBER print(""{}: Optimal parameter value for fatalities: {}"".format(full_country, popt_fatalities)) # Select test data for a given country and its province if the latter is given condition = (test_df[""Province_State""] == province) & (test_df[""Country_Region""] == country) t_df = test_df[condition] # Get the initial values to be used for generating future values for confirmed cases and fatalities last_train_date = t_df[""Date""].values[0] print(last_train_date) cases = p_df[p_df[""Date""] == last_train_date][""ConfirmedCases""].values[0] print(cases) fatalities = p_df[p_df[""Date""] == last_train_date][""Fatalities""].values[0] print(fatalities) # It's necessary to drop index in 't_df': otherwise, 't_df.loc[i, ""ForecastId""]' would fail, # starting from the second country t_df.reset_index(inplace=True, drop=True) for i in range(t_df.shape[0]): # Get a row index to write to idx = t_df.loc[i, ""ForecastId""] - 1 # make predictions cases = round(cases * (popt_cases[0] ** (1/popt_cases[1])), 0) submission_df.loc[idx, ""ConfirmedCases""] = cases fatalities = round(fatalities * (popt_fatalities[0] ** (1/popt_fatalities[1])), 0) submission_df.loc[idx, ""Fatalities""] = fatalities print(""*""*70)'",No,5,53.0 "submission_df.to_csv(""submission.csv"", index=False, header=True)",No,5,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load in import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) # Input data files are available in the ""../input/"" directory. # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory import os for dirname, _, filenames in os.walk('/kaggle/input'): print(dirname) for filename in filenames: print(os.path.join(dirname, filename)) # Any results you write to the current directory are saved as output. 
PATH_WEEK2='/kaggle/input/covid19-global-forecasting-week-2' df_train = pd.read_csv(f'{PATH_WEEK2}/train.csv') df_test = pd.read_csv(f'{PATH_WEEK2}/test.csv') print(""*""*100) print(df_train.head()) print(""*""*100) print(df_test.head()) print(""*""*100)'",No,3,45.0 "df_train.rename(columns={'Country_Region':'Country'}, inplace=True) df_test.rename(columns={'Country_Region':'Country'}, inplace=True) df_train.rename(columns={'Province_State':'State'}, inplace=True) df_test.rename(columns={'Province_State':'State'}, inplace=True) df_train['Date'] = pd.to_datetime(df_train['Date'], infer_datetime_format=True) df_test['Date'] = pd.to_datetime(df_test['Date'], infer_datetime_format=True) print(""*""*50) print(df_train.info()) print(""*""*50) print(df_test.info()) print(""*""*50)'",No,3,16.0 "NULL_VAL = ""NULL_VAL"" def fillState(state, country): if state == NULL_VAL: return country return state X_Train = df_train.loc[:, ['State', 'Country', 'Date', 'ConfirmedCases', 'Fatalities']] X_Train['State'].fillna(NULL_VAL, inplace=True) X_Train['State'] = X_Train.loc[:, ['State', 'Country']].apply(lambda x : fillState(x['State'], x['Country']), axis=1) X_Train.loc[:, 'Date'] = X_Train.Date.dt.strftime(""%m%d"") X_Train[""Date""] = X_Train[""Date""].astype(int) X_Test = df_test.loc[:, ['State', 'Country', 'Date', 'ForecastId']] X_Test['State'].fillna(NULL_VAL, inplace=True) X_Test['State'] = X_Test.loc[:, ['State', 'Country']].apply(lambda x : fillState(x['State'], x['Country']), axis=1) X_Test.loc[:, 'Date'] = X_Test.Date.dt.strftime(""%m%d"") X_Test[""Date""] = X_Test[""Date""].astype(int) print(""*""*50) print(X_Train.head()) print(""*""*50) print(X_Test.head()) print(""*""*50)'",No,3,17.0 "from sklearn.preprocessing import LabelEncoder le = LabelEncoder() X_Train.Country = le.fit_transform(X_Train.Country) X_Train['State'] = le.fit_transform(X_Train['State']) X_Test.Country = le.fit_transform(X_Test.Country) X_Test['State'] = le.fit_transform(X_Test['State']) print(""*""*50) print(X_Train.head()) print(""*""*50) print(X_Test.head()) print(""*""*50)'",No,4,20.0 "from xgboost import XGBRegressor from sklearn.preprocessing import LabelEncoder le = LabelEncoder() countries = X_Train.Country.unique() df_out = pd.DataFrame({'ForecastId': [], 'ConfirmedCases': [], 'Fatalities': []}) for country in countries: states = X_Train.loc[X_Train.Country == country, :].State.unique() for state in states: condition_train = (X_Train.Country == country) & (X_Train.State == state) # Get X and y (train) X_Train_CS = X_Train.loc[condition_train, ['State', 'Country', 'Date', 'ConfirmedCases', 'Fatalities']] y1_Train_CS = X_Train_CS.loc[:, 'ConfirmedCases'] y2_Train_CS = X_Train_CS.loc[:, 'Fatalities'] X_Train_CS = X_Train_CS.loc[:, ['State', 'Country', 'Date']] # Get X and y (test) condition_test = (X_Test.Country == country) & (X_Test.State == state) X_Test_CS = X_Test.loc[condition_test, ['State', 'Country', 'Date', 'ForecastId']] # Save forcast id for submission X_Test_CS_Id = X_Test_CS.loc[:, 'ForecastId'] X_Test_CS = X_Test_CS.loc[:, ['State', 'Country', 'Date']] model1 = XGBRegressor(n_estimators=1000) model1.fit(X_Train_CS, y1_Train_CS) y1_pred = model1.predict(X_Test_CS) model2 = XGBRegressor(n_estimators=1000) model2.fit(X_Train_CS, y2_Train_CS) y2_pred = model2.predict(X_Test_CS) df = pd.DataFrame({'ForecastId': X_Test_CS_Id, 'ConfirmedCases': y1_pred, 'Fatalities': y2_pred}) df_out = pd.concat([df_out, df], axis=0) # Done for state loop # Done for country Loop df_out.ForecastId = df_out.ForecastId.astype('int') 
df_out.tail()",No,3,7.0 "df_out.to_csv('submission.csv', index=False)",No,5,25.0 "import numpy as np import pandas as pd import os for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename))",No,5,88.0 "PATH_WEEK2='/kaggle/input/covid19-global-forecasting-week-2' df_train = pd.read_csv(f'{PATH_WEEK2}/train.csv') df_test = pd.read_csv(f'{PATH_WEEK2}/test.csv')",No,5,45.0 df_test.head(),No,5,41.0 "df_train.rename(columns={'Country_Region':'Country'}, inplace=True) df_test.rename(columns={'Country_Region':'Country'}, inplace=True) df_train.rename(columns={'Province_State':'State'}, inplace=True) df_test.rename(columns={'Province_State':'State'}, inplace=True)",No,5,61.0 "df_train['Date'] = pd.to_datetime(df_train['Date'], infer_datetime_format=True) df_test['Date'] = pd.to_datetime(df_test['Date'], infer_datetime_format=True)",No,5,16.0 df_test.info(),No,5,40.0 "y1_Train = df_train.iloc[:, -2] y1_Train.head()",No,3,21.0 "y2_Train = df_train.iloc[:, -1] y2_Train.head()",No,4,14.0 "EMPTY_VAL = ""EMPTY_VAL"" def fillState(state, country): if state == EMPTY_VAL: return country return state",No,5,53.0 "#X_Train = df_train.loc[:, ['State', 'Country', 'Date']] X_Train = df_train.copy() X_Train['State'].fillna(EMPTY_VAL, inplace=True) X_Train['State'] = X_Train.loc[:, ['State', 'Country']].apply(lambda x : fillState(x['State'], x['Country']), axis=1) X_Train.loc[:, 'Date'] = X_Train.Date.dt.strftime(""%m%d"") X_Train[""Date""] = X_Train[""Date""].astype(int) X_Train.head()'",No,3,17.0 "#X_Test = df_test.loc[:, ['State', 'Country', 'Date']] X_Test = df_test.copy() X_Test['State'].fillna(EMPTY_VAL, inplace=True) X_Test['State'] = X_Test.loc[:, ['State', 'Country']].apply(lambda x : fillState(x['State'], x['Country']), axis=1) X_Test.loc[:, 'Date'] = X_Test.Date.dt.strftime(""%m%d"") X_Test[""Date""] = X_Test[""Date""].astype(int) X_Test.head()'",No,4,17.0 "from sklearn import preprocessing le = preprocessing.LabelEncoder()",No,4,22.0 "X_Train.Country = le.fit_transform(X_Train.Country) X_Train['State'] = le.fit_transform(X_Train['State']) X_Train.head()",No,5,20.0 "X_Test.Country = le.fit_transform(X_Test.Country) X_Test['State'] = le.fit_transform(X_Test['State']) X_Test.head()",No,5,20.0 "df_train.loc[df_train.Country == 'Afghanistan', :]",No,5,14.0 df_test.tail(),No,5,41.0 "from warnings import filterwarnings filterwarnings('ignore')",No,5,23.0 le = preprocessing.LabelEncoder(),No,5,20.0 "from xgboost import XGBRegressor import lightgbm as lgb",No,5,22.0 "countries = X_Train.Country.unique() df_out = pd.DataFrame({'ForecastId': [], 'ConfirmedCases': [], 'Fatalities': []}) df_out2 = pd.DataFrame({'ForecastId': [], 'ConfirmedCases': [], 'Fatalities': []}) for country in countries: states = X_Train.loc[X_Train.Country == country, :].State.unique() #print(country, states) # check whether string is nan or not for state in states: X_Train_CS = X_Train.loc[(X_Train.Country == country) & (X_Train.State == state), ['State', 'Country', 'Date', 'ConfirmedCases', 'Fatalities']] y1_Train_CS = X_Train_CS.loc[:, 'ConfirmedCases'] y2_Train_CS = X_Train_CS.loc[:, 'Fatalities'] X_Train_CS = X_Train_CS.loc[:, ['State', 'Country', 'Date']] X_Train_CS.Country = le.fit_transform(X_Train_CS.Country) X_Train_CS['State'] = le.fit_transform(X_Train_CS['State']) X_Test_CS = X_Test.loc[(X_Test.Country == country) & (X_Test.State == state), ['State', 'Country', 'Date', 'ForecastId']] X_Test_CS_Id = X_Test_CS.loc[:, 'ForecastId'] X_Test_CS = X_Test_CS.loc[:, ['State', 
'Country', 'Date']] X_Test_CS.Country = le.fit_transform(X_Test_CS.Country) X_Test_CS['State'] = le.fit_transform(X_Test_CS['State']) # XGBoost model1 = XGBRegressor(n_estimators=2000) model1.fit(X_Train_CS, y1_Train_CS) y1_pred = model1.predict(X_Test_CS) model2 = XGBRegressor(n_estimators=2000) model2.fit(X_Train_CS, y2_Train_CS) y2_pred = model2.predict(X_Test_CS) # LightGBM model3 = lgb.LGBMRegressor(n_estimators=2000) model3.fit(X_Train_CS, y1_Train_CS) y3_pred = model3.predict(X_Test_CS) model4 = lgb.LGBMRegressor(n_estimators=2000) model4.fit(X_Train_CS, y2_Train_CS) y4_pred = model4.predict(X_Test_CS) df = pd.DataFrame({'ForecastId': X_Test_CS_Id, 'ConfirmedCases': y1_pred, 'Fatalities': y2_pred}) df2 = pd.DataFrame({'ForecastId': X_Test_CS_Id, 'ConfirmedCases': y3_pred, 'Fatalities': y4_pred}) df_out = pd.concat([df_out, df], axis=0) df_out2 = pd.concat([df_out2, df2], axis=0) # Done for state loop # Done for country Loop",No,3,7.0 "df_out.ForecastId = df_out.ForecastId.astype('int') df_out2.ForecastId = df_out2.ForecastId.astype('int')",No,5,16.0 "df_out['ConfirmedCases'] = (1/2)*(df_out['ConfirmedCases'] + df_out2['ConfirmedCases']) df_out['Fatalities'] = (1/2)*(df_out['Fatalities'] + df_out2['Fatalities'])",No,5,8.0 "df_out['ConfirmedCases'] = df_out['ConfirmedCases'].round().astype(int) df_out['Fatalities'] = df_out['Fatalities'].round().astype(int)",No,5,16.0 df_out.tail(),No,5,41.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load in import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) # Input data files are available in the ""../input/"" directory. # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory import os for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename)) pd.plotting.register_matplotlib_converters() import matplotlib.pyplot as plt import seaborn as sns # Any results you write to the current directory are saved as output.",No,5,88.0 "train_df = pd.read_csv(""../input/covid19-global-forecasting-week-2/train.csv"") test_df = pd.read_csv(""../input/covid19-global-forecasting-week-2/test.csv"")",No,5,45.0 train_df.head(50),No,5,41.0 "''' from pandas_profiling import ProfileReport train_profile = ProfileReport(train_df, title='Pandas Profiling Report', html={'style':{'full_width':True}}) train_profile '''",No,3,22.0 train_df.info(),No,5,40.0 "common_value = ""UNKNOWN"" # Replacing all the Province_State that are null by the Country_Region values train_df.Province_State.fillna(train_df.Country_Region, inplace=True) test_df.Province_State.fillna(test_df.Country_Region, inplace=True) # Handling the Date column # 1. Converting the object type column into datetime type train_df.Date = train_df.Date.apply(pd.to_datetime) test_df.Date = test_df.Date.apply(pd.to_datetime) # 2. 
Creating new features #train_df['ReportDay_year'] = train_df['Date'].dt.year #Not required this column because all the data is of this year train_df['ReportDay_month'] = train_df['Date'].dt.month train_df['ReportDay_week'] = train_df['Date'].dt.week train_df['ReportDay_day'] = train_df['Date'].dt.day #test_df['ReportDay_year'] = test_df['Date'].dt.year test_df['ReportDay_month'] = test_df['Date'].dt.month test_df['ReportDay_week'] = test_df['Date'].dt.week test_df['ReportDay_day'] = test_df['Date'].dt.day'",No,4,8.0 "#Dropping the date column train_df.drop(""Date"", inplace = True, axis = 1) test_df.drop(""Date"", inplace = True, axis = 1)",No,5,10.0 train_df.Province_State.value_counts(),No,5,72.0 " from sklearn.preprocessing import LabelEncoder le = LabelEncoder() train_df.Country_Region = le.fit_transform(train_df.Country_Region) train_df['Province_State'] = le.fit_transform(train_df['Province_State']) test_df.Country_Region = le.fit_transform(test_df.Country_Region) test_df['Province_State'] = le.fit_transform(test_df['Province_State']) ",No,5,20.0 \,No,3,20.0 "''' # Removing duplicate entries train_df = train_df.loc[:,~train_df.columns.duplicated()] test_df = test_df.loc[:,~test_df.columns.duplicated()] print (test_df.shape) '''",No,4,19.0 "''' # Dropping the object type columns train_df.drop(objList, axis=1, inplace=True) test_df.drop(objList, axis=1, inplace=True) print (train_df.shape) '''",No,4,10.0 \,No,5,71.0 test_df.info(),No,5,40.0 "X_train = train_df.drop([""Id"", ""ConfirmedCases"", ""Fatalities""], axis = 1) Y_train_CC = train_df[""ConfirmedCases""] Y_train_Fat = train_df[""Fatalities""] #X_test = test_df.drop([""ForecastId""], axis = 1) X_test = test_df.drop([""ForecastId""], axis = 1) ",No,5,21.0 \,No,5,28.0 "from sklearn.model_selection import ShuffleSplit, cross_val_score skfold = ShuffleSplit(random_state=7)",No,5,84.0 " #1.Ridge Regression #Model import from sklearn.linear_model import Ridge #train classifier reg_CC = Ridge(alpha=1.0) reg_Fat = Ridge(alpha=1.0) #Cross Validation to calculate the score score_CC = cross_val_score(reg_CC, X_train, Y_train_CC, cv = skfold) score_Fat = cross_val_score(reg_Fat, X_train, Y_train_Fat, cv = skfold) #rmsle_svm = test_model_r2(clf_svm, ""CC"") #Print the scores print (score_CC.mean(), score_Fat.mean()) ",No,5,84.0 " #2.Lasso Regression #Model import from sklearn import linear_model #train classifier reg_CC = linear_model.Lasso(alpha=0.1) reg_Fat = linear_model.Lasso(alpha=0.1) #Cross Validation to calculate the score score_CC = cross_val_score(reg_CC, X_train, Y_train_CC, cv = skfold) score_Fat = cross_val_score(reg_Fat, X_train, Y_train_Fat, cv = skfold) #rmsle_svm = test_model_r2(clf_svm, ""CC"") #Print the scores print (score_CC.mean(), score_Fat.mean()) ",No,5,28.0 " #3. SVM #Model import from sklearn import svm #train classifier reg_CC = svm.SVC() reg_Fat = svm.SVC() #Cross Validation to calculate the score score_CC = cross_val_score(reg_CC, X_train, Y_train_CC, cv = skfold) score_Fat = cross_val_score(reg_Fat, X_train, Y_train_Fat, cv = skfold) #Print the scores print (score_CC.mean(), score_Fat.mean()) ",No,5,28.0 " #3. 
ElasticNet #Model import from sklearn.linear_model import ElasticNet #train classifier reg_CC = ElasticNet(random_state=0) reg_Fat = ElasticNet(random_state=0) #Cross Validation to calculate the score score_CC = cross_val_score(reg_CC, X_train, Y_train_CC, cv = skfold) score_Fat = cross_val_score(reg_Fat, X_train, Y_train_Fat, cv = skfold) #Print the scores print (score_CC.mean(), score_Fat.mean()) ",No,5,28.0 " #5. LinearRegression #Model import from sklearn.linear_model import LinearRegression #train classifier reg_CC = LinearRegression() reg_Fat = LinearRegression() #Cross Validation to calculate the score score_CC = cross_val_score(reg_CC, X_train, Y_train_CC, cv = skfold) score_Fat = cross_val_score(reg_Fat, X_train, Y_train_Fat, cv = skfold) #Print the scores print (score_CC.mean(), score_Fat.mean()) ",No,5,28.0 \,No,5,28.0 \,No,5,28.0 \,No,3,28.0 \,No,5,28.0 " #5. BaggingClassifier from sklearn.ensemble import BaggingRegressor from sklearn.tree import DecisionTreeRegressor clf_bgr_CC = BaggingRegressor(base_estimator = DecisionTreeRegressor()) clf_bgr_Fat = BaggingRegressor(base_estimator = DecisionTreeRegressor()) rmsle_bgr_CC = test_model(clf_bgr_CC, ""CC"") rmsle_bgr_Fat = test_model(clf_bgr_Fat, ""Fat"") print (rmsle_bgr_CC, rmsle_bgr_Fat) ",No,5,84.0 \,No,4,28.0 "reg_CC.fit(X_train, Y_train_CC) Y_pred_CC = reg_CC.predict(X_test) reg_Fat.fit(X_train, Y_train_Fat) Y_pred_Fat = reg_Fat.predict(X_test) ",No,5,48.0 print (Y_pred_Fat),No,5,53.0 "#Using pd.to_datetime for adding new features df_train['Date'] = pd.to_datetime(df_train['Date']) df_train.insert(1,'Week',df_train['Date'].dt.week) df_train.insert(2,'Day',df_train['Date'].dt.day) df_train.insert(3,'DayofWeek',df_train['Date'].dt.dayofweek) df_train.insert(4,'DayofYear',df_train['Date'].dt.dayofyear) df_test['Date'] = pd.to_datetime(df_test['Date']) df_test.insert(1,'Week',df_test['Date'].dt.week) df_test.insert(2,'Day',df_test['Date'].dt.day) df_test.insert(3,'DayofWeek',df_test['Date'].dt.dayofweek) df_test.insert(4,'DayofYear',df_test['Date'].dt.dayofyear)",No,5,8.0 "# Replacing all the Province_State that are null by the Country_Region values df_train.Province_State.fillna(df_train.Country_Region, inplace=True) df_test.Province_State.fillna(df_test.Country_Region, inplace=True)",No,5,17.0 "from sklearn.preprocessing import LabelEncoder le = LabelEncoder() df_train.Country_Region = le.fit_transform(df_train.Country_Region) df_train['Province_State'] = le.fit_transform(df_train['Province_State']) df_test.Country_Region = le.fit_transform(df_test.Country_Region) df_test['Province_State'] = le.fit_transform(df_test['Province_State']) ",No,5,20.0 "#One Hot Encoding columns def one_hot(df, cols): """""" @param df pandas DataFrame @param cols a list of columns to encode @return a DataFrame with one-hot encoding """""" i = 0 for each in cols: #print (each) dummies = pd.get_dummies(df[each], prefix=each, drop_first= True) if i == 0: print (dummies) i = i + 1 df = pd.concat([df, dummies], axis=1) return df",No,5,20.0 "#Handling categorical data objList = df_train.select_dtypes(include = ""object"").columns df_train = one_hot(df_train, objList) df_test = one_hot(df_test, objList) print (df_train.shape)",No,4,8.0 "#Avoiding duplicated data. 
df_train = df_train.loc[:,~df_train.columns.duplicated()] df_test = df_test.loc[:,~df_test.columns.duplicated()] print (df_test.shape)",No,4,19.0 "#reading data data = pd.read_csv('../input/covid19-global-forecasting-week-2/train.csv') test_data = pd.read_csv('../input/covid19-global-forecasting-week-2/test.csv') submission = pd.read_csv('../input/covid19-global-forecasting-week-2/submission.csv') print(data.shape) print(test_data.shape) print(submission.shape)",No,4,45.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load in import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) # Input data files are available in the ""../input/"" directory. # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory import os for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename)) # Any results you write to the current directory are saved as output.",No,4,22.0 "dftrain = pd.read_csv(""/kaggle/input/covid19-global-forecasting-week-2/train.csv"") dftrain[""GrowthRate""] = dftrain[""ConfirmedCases""] / dftrain.ConfirmedCases.shift(1) print(dftrain.Date.unique()) dftest = pd.read_csv(""/kaggle/input/covid19-global-forecasting-week-2/test.csv"") print(dftest.columns.values) dfsubmission = pd.read_csv(""/kaggle/input/covid19-global-forecasting-week-2/submission.csv"")",No,3,45.0 "growth_per_country = {} means_per_country = {} last_per_country = {} def train(country, region, growth): if growth < 2.0: print(growth) growth_list = growth_per_country.get((country,region), list()) growth_list.append(growth) growth_per_country[(country,region)] = growth_list def predict(country, region): if not (country,region) in means_per_country: means_per_country[(country,region)] = np.mean(growth_per_country[(country,region)]) growth = means_per_country[(country,region)] return growth",No,2,7.0 "## Training for row in dftrain.itertuples(): train(row.Country_Region, row.Province_State, row.GrowthRate)",No,3,7.0 "## Current submission for row in dftest.itertuples(): if(row.ForecastId%100 == 0): print(row.ForecastId) if type(row.Province_State)!=str: dfnow = dftrain[dftrain.Country_Region == row.Country_Region] else: dfnow = dftrain[dftrain.Country_Region == row.Country_Region][dftrain.Province_State == row.Province_State] filterDate = dfnow[""Date""].isin([row.Date]) if len(dfnow[filterDate].values) == 0: growth = predict(row.Country_Region, row.Province_State) pred = pred * growth predfat = predfat * abs(growth - 0.1) else: pred = dfnow[filterDate][""ConfirmedCases""].values[0] predfat = dfnow[filterDate][""Fatalities""].values[0] dfsubmission.at[row.ForecastId-1, ""ConfirmedCases""] = int(pred) dfsubmission.at[row.ForecastId-1, ""Fatalities""] = int(predfat) dfsubmission dfsubmission.to_csv('submission.csv', index=False) # end, the rest is experimental code'",No,3,27.0 print(dfsubmission.head(60)),No,5,41.0 "dfsubmission[""Date""] = dftest[""Date""] dfsub = dfsubmission[dftest.Country_Region == 'Ireland'][dftest.Province_State.isnull()] dfsub[:20].plot(""Date"", ""ConfirmedCases"")'",No,3,33.0 "df = pd.read_csv(""/kaggle/input/covid19-global-forecasting-week-2/train.csv"") df.head(40) print(df.columns.values) df[""GrowthRate""] = df[""ConfirmedCases""] / df.ConfirmedCases.shift(1) 
df[""Growth""] = df[""ConfirmedCases""] - df.ConfirmedCases.shift(1) df[""PredictedCasesByRate""] = df[""ConfirmedCases""].shift(1) * df.GrowthRate.shift(1) df[""PredictedCases""] = df[""ConfirmedCases""].shift(1) + df.Growth.shift(1) df[""ErrorByRate""] = (df.ConfirmedCases-df.PredictedCasesByRate)/df.ConfirmedCases df[""Error""] = (df.ConfirmedCases - df.PredictedCases)/df.ConfirmedCases df[""FGrowth""] = df[""Fatalities""]/df.Fatalities.shift(1) print(df.head()) dff = df[df.Country_Region == 'Italy'][df.Province_State.isnull()][df.ConfirmedCases >= 100] dff.plot(""Date"", [""ConfirmedCases"",""PredictedCases"", ""PredictedCasesByRate""]) dff.plot(""Date"", [""ErrorByRate"", ""Error""]) dff.plot(""Date"", ""GrowthRate"") dff.head(15)'",No,5,53.0 "dftest = pd.read_csv(""/kaggle/input/covid19-global-forecasting-week-2/test.csv"") dftest.head(40)",No,4,45.0 "dfg = df.groupby([df.Country_Region, df.Province_State]) dfg.head()",No,3,60.0 "import numpy as np import pandas as pd df = pd.read_csv(""../input/covid19-global-forecasting-week-2/train.csv"") df.shape",No,3,45.0 "loc_group = [""Province_State"", ""Country_Region""] def preprocess(df): df[""Date""] = df[""Date""].astype(""datetime64[ms]"") for col in loc_group: df[col].fillna(""none"", inplace=True) return df df = preprocess(df) sub_df = preprocess(pd.read_csv(""../input/covid19-global-forecasting-week-2/test.csv"")) df.head()",No,4,45.0 "# Dropping the object type columns df_train.drop(objList, axis=1, inplace=True) df_test.drop(objList, axis=1, inplace=True) print (df_train.shape)",No,4,10.0 "df[""Date""].min(), df[""Date""].max()",No,5,40.0 "TARGETS = [""ConfirmedCases"", ""Fatalities""] for col in TARGETS: df[col] = np.log1p(df[col])",No,5,8.0 "for col in TARGETS: df[""prev_{}"".format(col)] = df.groupby(loc_group)[col].shift()",No,5,60.0 "df = df[df[""Date""] > df[""Date""].min()].copy() df.head()",No,4,14.0 df_train,No,5,41.0 "X = df_train.drop(['Date', 'ConfirmedCases', 'Fatalities'], axis=1) y = df_train[['ConfirmedCases', 'Fatalities']]",No,5,21.0 "from sklearn.linear_model import LinearRegression from sklearn.linear_model import BayesianRidge from sklearn.neighbors import KNeighborsRegressor from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor from sklearn.tree import DecisionTreeRegressor from sklearn.model_selection import train_test_split from sklearn.model_selection import cross_val_score, KFold from sklearn.metrics import make_scorer, r2_score, mean_squared_log_error from sklearn.ensemble import BaggingRegressor",No,5,22.0 "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)",No,5,13.0 y_train.head(),No,5,41.0 "n_folds = 5 cv = KFold(n_splits = 5, shuffle=True, random_state=42).get_n_splits(X_train.values)",No,5,84.0 "def predict_scores(reg_alg): r2 = make_scorer(r2_score) m = reg_alg() m.fit(X_train, y_train['ConfirmedCases']) y_pred = m.predict(X_test) m_r = cross_val_score(m, X_train, y_train['ConfirmedCases'], cv=cv, scoring = r2) sc_Cases.append(m_r) m.fit(X_train, y_train['Fatalities']) y_pred = m.predict(X_test) m_r2 = cross_val_score(m, X_train, y_train['Fatalities'], cv=cv, scoring = r2) sc_Fatalities.append(m_r2) reg_models = [KNeighborsRegressor, LinearRegression, RandomForestRegressor, GradientBoostingRegressor, DecisionTreeRegressor,BayesianRidge, BaggingRegressor] sc_Cases = [] sc_Fatalities = [] for x in reg_models: predict_scores(x)",No,5,3.0 sc_Cases,No,5,53.0 sc_Fatalities,No,5,53.0 from sklearn.ensemble import BaggingRegressor,No,5,22.0 " 
#Hyperparameter tuning from sklearn.model_selection import RandomizedSearchCV,GridSearchCV param_grid = { 'n_estimators':[10, 30, 50, 100,250,500,750,1000,1250,1500,1750], 'max_samples':[2,4,6,8,10,20,40,60,100], ""max_features"": [0.5, 1.0], 'n_jobs':[-2, -1, 1, 2, 3, 4, 5], ""bootstrap_features"": [True, False] } '''param_grid = {""criterion"": [""mae""], ""min_samples_split"": [10, 20, 40], ""max_depth"": [2, 6, 8], ""min_samples_leaf"": [20, 40, 100], ""max_leaf_nodes"": [5, 20, 100], }''' asdf = BaggingRegressor() clf_CC = RandomizedSearchCV(asdf, param_grid ) clf_Fat = RandomizedSearchCV(asdf, param_grid ) clf_CC.fit(X_train, y_train['ConfirmedCases']) clf_Fat.fit(X_train, y_train['Fatalities']) '",No,5,6.0 "model1 = clf_CC model1.fit(X_train, y_train['ConfirmedCases']) model2 = clf_Fat model2.fit(X_train, y_train['Fatalities'])",No,5,7.0 "df_test['ConfirmedCases'] = model1.predict(df_test.drop(['Date', 'ForecastId'], axis=1)) df_test['Fatalities'] = model2.predict(df_test.drop(['Date', 'ForecastId', 'ConfirmedCases'], axis=1))",No,5,48.0 "import warnings warnings.filterwarnings('ignore') df_results = df_test[['ForecastId', 'ConfirmedCases', 'Fatalities']] df_results['ConfirmedCases'] = df_results['ConfirmedCases'].astype(int) df_results['Fatalities'] = df_results['Fatalities'].astype(int) df_results.head()",No,4,16.0 "df_results.to_csv('submission.csv', index=False)",No,5,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load in import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) # Input data files are available in the ""../input/"" directory. # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory import os import multiprocessing for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename)) mem_bytes = os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES') # e.g. 4015976448 mem_gib = mem_bytes/(1024.**3) # e.g. 
3.74 print(""RAM: %f GB"" % mem_gib) print(""CORES: %d"" % multiprocessing.cpu_count()) # Any results you write to the current directory are saved as output.",No,5,88.0 "import plotly.graph_objects as go import matplotlib.pyplot as plt from tqdm import tqdm import time from datetime import datetime from pathlib import Path from sklearn import preprocessing import keras.backend as K from keras.models import Sequential from keras.layers import Dense, LSTM, Dropout, GRU from keras.callbacks import EarlyStopping from sklearn.preprocessing import StandardScaler, MinMaxScaler",No,5,22.0 "train = pd.read_csv(""/kaggle/input/covid19-global-forecasting-week-2/train.csv"") test = pd.read_csv(""/kaggle/input/covid19-global-forecasting-week-2/test.csv"") train.tail()",No,4,45.0 test.tail(),No,5,41.0 train.info(),No,5,40.0 "train['Date'] = pd.to_datetime(train['Date']) test['Date'] = pd.to_datetime(test['Date']) train['Country_Region'] = train['Country_Region'].astype(str) # train['Province_State'] = train['Province_State'].astype(str) test['Country_Region'] = test['Country_Region'].astype(str) # test['Province_State'] = test['Province_State'].astype(str)",No,5,16.0 "EMPTY_VAL = ""EMPTY_VAL"" def fillState(state, country): if state == EMPTY_VAL: return country return state train['Province_State'].fillna(EMPTY_VAL, inplace=True) train['Province_State'] = train.loc[:, ['Province_State', 'Country_Region']].apply(lambda x : fillState(x['Province_State'], x['Country_Region']), axis=1) test['Province_State'].fillna(EMPTY_VAL, inplace=True) test['Province_State'] = test.loc[:, ['Province_State', 'Country_Region']].apply(lambda x : fillState(x['Province_State'], x['Country_Region']), axis=1)'",No,5,17.0 "le = preprocessing.LabelEncoder() train['country_encoder'] = le.fit_transform(train['Country_Region']) train['date_int'] = train['Date'].apply(lambda x: datetime.strftime(x, '%m%d')).astype(int) test['country_encoder'] = le.transform(test['Country_Region']) test['date_int'] = test['Date'].apply(lambda x: datetime.strftime(x, '%m%d')).astype(int)",No,4,20.0 "le = preprocessing.LabelEncoder() train['province_encoder'] = le.fit_transform(train['Province_State']) test['province_encoder'] = le.transform(test['Province_State'])",No,5,20.0 "#TODO: takes 44m ish, consider multi-processing, multi-cores, run in GPU #TODO: create data_generate func from joblib import parallel_backend start_time = time.time() country = train['Country_Region'].drop_duplicates() train_df = train.copy() train_df.rename(columns={'Date': 'date', 'ConfirmedCases': 'cc_cases', 'Fatalities': 'ft_cases', 'Country_Region': 'country', 'Province_State': 'province'}, inplace=True) lags = np.arange(1,8,1) # lag of 1 to 7 with parallel_backend('threading', n_jobs = -1): with tqdm(total = len(list(train_df['date'].unique()))) as pbar: for d in train_df['date'].drop_duplicates(): for i in country: province = train_df[train_df['country'] == i]['province'].drop_duplicates() for j in province: mask = (train_df['date'] == d) & (train_df['country'] == i) & (train_df['province'] == j) for lag in lags: mask_org = (train_df['date'] == (d - pd.Timedelta(days=lag))) & (train_df['country'] == i) & (train_df['province'] == j) try: train_df.loc[mask, 'cc_cases_' + str(lag)] = train_df.loc[mask_org, 'cc_cases'].values except: train_df.loc[mask, 'cc_cases_' + str(lag)] = 0 try: train_df.loc[mask, 'ft_cases_' + str(lag)] = train_df.loc[mask_org, 'ft_cases'].values except: train_df.loc[mask, 'ft_cases_' + str(lag)] = 0 pbar.update(1) print('Time spent for building 
features is {} minutes'.format(round((time.time()-start_time)/60,1)))",No,4,8.0 "# train_df.to_csv(Path('/kaggle/working', 'train_df.csv')) # saved locally, reload it train_df = pd.read_csv(Path('/kaggle/working/', 'train_df.csv'), index_col = 0, parse_dates = ['date']) train_df[train_df['country'] == 'Italy'].tail()",No,4,45.0 "#TODO: walk forward validation def split_train_val(df, val_ratio): val_len = int(len(df) * val_ratio) train_set = df[:-val_len] val_set = df[-val_len:] return train_set, val_set",No,5,13.0 "test_fixed_cols = ['ForecastId', 'Province_State', 'Country_Region', 'Date'] fixed_cols = ['Id', 'province', 'country', 'date'] output_cols = ['cc_cases', 'ft_cases'] input_cols = list(set(train_df.columns.to_list()) - set(fixed_cols) - set(output_cols)) print('output columns are ', output_cols) print('input columns are ', input_cols) X = train_df[input_cols] y = train_df[output_cols]",No,5,21.0 "# split to cumulative and fatal features and build 2 separate models # split to train and validation set cc_input = ['cc_cases_1', 'cc_cases_2', 'cc_cases_3', 'cc_cases_4', 'cc_cases_5', 'cc_cases_6', 'cc_cases_7', 'country_encoder', 'province_encoder', 'date_int'] ft_input = ['ft_cases_1', 'ft_cases_2', 'ft_cases_3', 'ft_cases_4', 'ft_cases_5', 'ft_cases_6', 'ft_cases_7', 'country_encoder', 'province_encoder', 'date_int'] cc_output = ['cc_cases'] ft_output = ['ft_cases'] X_cc = X[cc_input] X_ft = X[ft_input] y_cc = y[cc_output] y_ft = y[ft_output] train_X_cc, val_X_cc = split_train_val(df = X_cc, val_ratio = 0.1) train_y_cc, val_y_cc = split_train_val(df = y_cc, val_ratio = 0.1) train_X_ft, val_X_ft = split_train_val(df = X_ft, val_ratio = 0.1) train_y_ft, val_y_ft = split_train_val(df = y_ft, val_ratio = 0.1)",No,5,13.0 "idx = np.random.RandomState(seed=42).permutation(train_X_cc.index) train_X_cc = train_X_cc.reindex(idx) train_y_cc = train_y_cc.reindex(idx) train_X_ft = train_X_ft.reindex(idx) train_y_ft = train_y_ft.reindex(idx) # train_y_cc.tail()",No,5,15.0 "# normalization X_scaler_cc = MinMaxScaler() X_train_cc = X_scaler_cc.fit_transform(train_X_cc) X_val_cc = X_scaler_cc.transform(val_X_cc) # intput/output 2D array-like y_scaler_cc = MinMaxScaler() y_train_cc = y_scaler_cc.fit_transform(train_y_cc) y_val_cc = y_scaler_cc.transform(val_y_cc) # array-like",No,5,18.0 "X_scaler_ft = MinMaxScaler() X_train_ft = X_scaler_ft.fit_transform(train_X_ft) X_val_ft = X_scaler_ft.transform(val_X_ft) # intput/output 2D array-like y_scaler_ft = MinMaxScaler() y_train_ft = y_scaler_ft.fit_transform(train_y_ft) y_val_ft = y_scaler_ft.transform(val_y_ft) # array-like",No,5,18.0 "print('Validate if train and test is splited correctly for 2 cases: ') print('cumulative cases training has shape ', X_train_cc.shape, y_train_cc.shape) print('fatal cases training has shape ', X_train_ft.shape, y_train_ft.shape) print('cumulative cases valid has shape ', X_val_cc.shape, y_val_cc.shape) print('fatal cases valid has shape ', X_val_ft.shape, y_val_ft.shape) #TODO print('Validate if train and test contains np.nan, np.inf, -np.inf after standardization: ')",No,3,41.0 "# if choose to not apply normalization, however it generates NaN in output... 
X_train_cc = train_X_cc.to_numpy() X_val_cc = val_X_cc.to_numpy() X_train_ft = train_X_ft.to_numpy() X_val_ft = val_X_ft.to_numpy() y_train_cc = train_y_cc.to_numpy() y_val_cc = val_y_cc.to_numpy() y_train_ft = train_y_ft.to_numpy() y_val_ft = val_y_ft.to_numpy()",No,3,12.0 "# for LSTM, intput.shape = (n_samples, 1, n_features) X_train_cc = X_train_cc.reshape(X_train_cc.shape[0], 1, X_train_cc.shape[1]) X_val_cc = X_val_cc.reshape(X_val_cc.shape[0], 1, X_val_cc.shape[1]) X_train_ft = X_train_ft.reshape(X_train_ft.shape[0], 1, X_train_ft.shape[1]) X_val_ft = X_val_ft.reshape(X_val_ft.shape[0], 1, X_val_ft.shape[1]) print(X_train_cc.shape, X_val_cc.shape, X_train_ft.shape, X_val_ft.shape)",No,3,12.0 "import pandas as pd import datetime import lightgbm as lgb import numpy as np from sklearn import preprocessing",No,4,22.0 "train = pd.read_csv(""../input/covid19-global-forecasting-week-2/train.csv"") test = pd.read_csv(""../input/covid19-global-forecasting-week-2/test.csv"") sub = pd.read_csv(""../input/covid19-global-forecasting-week-2/submission.csv"")",No,5,45.0 train,No,5,41.0 train = train.append(test[test['Date']>'2020-03-31']),No,5,11.0 "train['Date'] = pd.to_datetime(train['Date'], format='%Y-%m-%d')",No,5,16.0 train['day_dist'] = train['Date']-train['Date'].min(),No,5,8.0 train['day_dist'] = train['day_dist'].dt.days,No,5,8.0 "print(train['Date'].max()) #print(val['Date'].max()) print(test['Date'].min()) print(test['Date'].max()) #print(test['Date'].max()-test['Date'].min())",No,5,40.0 "cat_cols = train.dtypes[train.dtypes=='object'].keys() cat_cols",No,5,77.0 "for cat_col in cat_cols: train[cat_col].fillna('no_value', inplace = True)",No,5,17.0 "train['place'] = train['Province_State']+'_'+train['Country_Region'] #vcheck = train[(train['Date']>='2020-03-12')]",No,5,8.0 "for cat_col in ['place']: #train[cat_col].fillna('no_value', inplace = True) #train[cat_col].value_counts().idxmax() le = preprocessing.LabelEncoder() le.fit(train[cat_col]) train[cat_col]=le.transform(train[cat_col])",No,5,20.0 train.keys(),No,5,40.0 "drop_cols = ['Id','ForecastId', 'ConfirmedCases','Date', 'Fatalities', 'day_dist', 'Province_State', 'Country_Region'] #,'day_dist','shift_22_ft','shift_23_ft','shift_24_ft','shift_25_ft','shift_26_ft']",No,5,77.0 "#val = train[(train['Id']).isnull()==True] #train = train[(train['Id']).isnull()==False] val = train[(train['Date']>='2020-03-12')&(train['Id'].isnull()==False)] #test = train[(train['Date']>='2020-03-12')&(train['Id'].isnull()==True)] #train = train[(train['Date']<'2020-03-22')&(train['Id'].isnull()==False)]",No,5,14.0 val,No,5,53.0 "y_ft = train[""Fatalities""] y_val_ft = val[""Fatalities""] y_cc = train[""ConfirmedCases""] y_val_cc = val[""ConfirmedCases""] #train.drop(drop_cols, axis=1, inplace=True) #test.drop(drop_cols, axis=1, inplace=True) #val.drop(drop_cols, axis=1, inplace=True)",No,5,21.0 "# def rmsle (y_true, y_pred): return np.sqrt(np.mean(np.power(np.log1p(y_pred) - np.log1p(y_true), 2)))'",No,5,84.0 "def mape (y_true, y_pred): return np.mean(np.abs(y_pred -y_true)*100/(y_true+1))",No,5,84.0 dates = dates[dates>'2020-03-31'],No,5,14.0 train[train['Date']==date],No,5,14.0 test[test['Country_Region']=='Italy'],No,5,14.0 test[(test['Country_Region']=='China')&(test['Province_State']=='Zhejiang')],No,5,14.0 y_pred.mean(),No,5,40.0 print(len(test)),No,5,58.0 "train_sub = pd.read_csv(""../input/covid19-global-forecasting-week-2/train.csv"")",No,5,45.0 test.head(),No,5,41.0 test.loc[test['ConfirmedCases_x'].isnull()==True],No,5,14.0 
"test.loc[test['ConfirmedCases_x'].isnull()==True, 'ConfirmedCases_x'] =test.loc[test['ConfirmedCases_x'].isnull()==True, 'ConfirmedCases_y']",No,5,14.0 "test.loc[test['Fatalities_x'].isnull()==True, 'Fatalities_x'] = test.loc[test['Fatalities_x'].isnull()==True, 'Fatalities_y']",No,5,14.0 dates,No,5,53.0 "last_amount = test.loc[(test['Country_Region']=='Italy')&(test['Date']=='2020-03-31'),'ConfirmedCases_x']",No,5,14.0 "last_fat = test.loc[(test['Country_Region']=='Italy')&(test['Date']=='2020-03-31'),'Fatalities_x']",No,5,14.0 last_fat.values[0],No,5,41.0 "test.loc[(test['Country_Region']=='Italy')] #&(test['Date']==date),'ConfirmedCases_x' ",No,5,14.0 "for date in dates: k = k-1 i = i+1 test.loc[(test['Country_Region']=='Italy')&(test['Date']==date), 'ConfirmedCases_x']=last_amount.values[0] + i*(5000-(100*i)) test.loc[(test['Country_Region']=='Italy')&(test['Date']==date), 'Fatalities_x'] = last_fat.values[0]+i*(800-(10*i))",No,5,8.0 "last_amount = test.loc[(test['Country_Region']=='China')&(test['Province_State']!='Hubei')&(test['Date']=='2020-03-31'),'ConfirmedCases_x'] last_fat = test.loc[(test['Country_Region']=='China')&(test['Province_State']!='Hubei')&(test['Date']=='2020-03-31'),'Fatalities_x']",No,5,77.0 "i = 0 k = 30 for date in dates: k = k-1 i = i+1 test.loc[(test['Country_Region']=='China')&(test['Province_State']!='Hubei')&(test['Date']==date), 'Fatalities_x']= last_fat.values test.loc[(test['Country_Region']=='China')&(test['Province_State']!='Hubei')&(test['Date']==date), 'ConfirmedCases_x']= last_amount.values + i",No,5,8.0 "last_amount = test.loc[(test['Country_Region']=='China')&(test['Province_State']=='Hubei')&(test['Date']=='2020-03-31'),'ConfirmedCases_x'] last_fat = test.loc[(test['Country_Region']=='China')&(test['Province_State']=='Hubei')&(test['Date']=='2020-03-31'),'Fatalities_x']",No,5,14.0 "k=30 i=0 for date in dates: k = k-1 i = i+1 test.loc[(test['Country_Region']=='China')&(test['Province_State']=='Hubei')&(test['Date']==date),'ConfirmedCases_x']= last_amount.values[0] test.loc[(test['Country_Region']=='China')&(test['Province_State']=='Hubei')&(test['Date']==date),'Fatalities_x']= last_fat.values[0] + i ",No,5,8.0 sub,No,5,41.0 "sub.loc[sub['ConfirmedCases']<0,'ConfirmedCases']=0",No,5,8.0 "sub.loc[sub['Fatalities']<0, 'Fatalities']=0",No,5,14.0 sub['Fatalities'].describe(),No,5,40.0 sub['ConfirmedCases'].describe(),No,5,40.0 "sub.to_csv('submission.csv',index=False)",No,5,25.0 "# customize loss function which is aligned with kaggle evaluation def root_mean_squared_log_error(y_true, y_pred): return K.sqrt(K.mean(K.square(K.log(y_pred + 1) - K.log(y_true + 1)))) ",No,3,28.0 "#declaring only one model def GRU_model(n_1, input_dim, output_dim): model = Sequential() model.add(GRU(n_1,input_shape=(1, input_dim), activation='relu')) model.add(Dropout(0.1)) model.add(Dense(output_dim, activation='relu')) model.compile(loss=root_mean_squared_log_error, optimizer='adam') print(model.summary()) return model",No,5,4.0 "#TODO: debug sometimes it's getting inf. 
Suspect bad input model_cc = GRU_model(4, X_train_cc.shape[-1], y_train_cc.shape[-1]) model_ft = GRU_model(4, X_train_ft.shape[-1], y_train_ft.shape[-1]) early_stop = EarlyStopping(monitor='loss', patience=5, verbose=0, mode='min')",No,5,4.0 "groups_dict =dfg.groups for group, indexes in groups_dict.items(): print(group) tempdf = df.loc[indexes[0]:indexes[-1]] print(tempdf.shape) if False: tempdf[""Growth""] = tempdf.ConfirmedCases/tempdf.ConfirmedCases.shift(1) tempdf[""FGrowth""] = tempdf.Fatalities/tempdf.Fatalities.shift(1) tempdf.plot(""Date"", [""Growth"",""FGrowth""])",No,2,21.0 "b""dfa = df[df.Country_Region == 'Spain'][df.Province_State.isnull()][df.ConfirmedCases>10]\nprint(dftesta.shape)\nprint(dfa.shape)\n\nfrom matplotlib import pyplot\nfrom statsmodels.tsa.ar_model import AR\nfrom sklearn.metrics import mean_squared_error\n\nX = list(dfa.GrowthRate.values)\n\nX = [x for x in X if not np.isnan(x) and not np.isinf(x)]\nprint(len(X))\n\ntrain, test = X[:len(X)-6], X[len(X)-6:len(X)]\nprint(len(train))\n# train autoregression\nmodel = AR(train)\nmodel_fit = model.fit()\nwindow = model_fit.k_ar\ncoef = model_fit.params\n# walk forward over time steps in test\nhistory = train[len(train)-window:]\nhistory = [history[i] for i in range(len(history))]\npredictions = list()\nfor t in range(len(test)+31):\n\tlength = len(history)\n\tlag = [history[i] for i in range(length-window,length)]\n\tyhat = coef[0]\n\tfor d in range(window):\n\t\tyhat += coef[d+1] * lag[window-d-1]\n\tif t >= len(test):\n\t\ttest.append(yhat)\n\tobs = test[t]\n\tpredictions.append(yhat)\n\thistory.append(obs)\n\tprint('predicted=%f, expected=%f' % (yhat, obs))\nerror = mean_squared_error(test, predictions)\nprint('Test MSE: %.3f' % error)\n# plot\npyplot.plot(train+test)\npyplot.plot(train+predictions, color='red')\npyplot.show()""",No,4,8.0 "b""dfa = df[df.Country_Region == 'Italy'][df.Province_State.isnull()][df.ConfirmedCases>10]\nprint(dftesta.shape)\nprint(dfa.shape)\n\nfrom matplotlib import pyplot\nfrom statsmodels.tsa.ar_model import AR\nfrom sklearn.metrics import mean_squared_error\n\nX = list(dfa.GrowthRate.values)\n\nX = [x for x in X if not np.isnan(x) and not np.isinf(x)]\nprint(len(X))\n\ntrain, test = X[10:len(X)-6], X[len(X)-6:len(X)]\nprint(len(train))\n# train autoregression\nmodel = AR(train)\nmodel_fit = model.fit()\nprint('Lag: %s' % model_fit.k_ar)\nprint('Coefficients: %s' % model_fit.params)\n# make predictions\npredictions = model_fit.predict(start=len(train), end=len(train)+len(test)-1, dynamic=False)\nfor i in range(len(predictions)):\n\tprint('predicted=%f, expected=%f' % (predictions[i], test[i]))\nerror = mean_squared_error(test, predictions)\nprint('Test MSE: %.3f' % error)\n# plot results\npyplot.plot(train[:15]+test)\npyplot.plot(train[:15]+list(predictions), color='red')\npyplot.show()""",No,4,7.0 "#dfsubmission.to_csv('submission.csv', index=False) ",No,4,7.0 "import pandas as pd import matplotlib.pyplot as plt import numpy as np import seaborn as sns from sklearn.preprocessing import LabelEncoder from skmultilearn.problem_transform import BinaryRelevance from sklearn.naive_bayes import GaussianNB",No,1,45.0 import pandas as pd,No,5,22.0 "train = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-2/train.csv') test = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-2/test.csv')",No,5,45.0 test.info(),No,5,40.0 train.sample(3),No,5,41.0 "train['Date'] = pd.to_datetime(train['Date']) test['Date'] = pd.to_datetime(test['Date'])",No,5,16.0 "train['Date'] = 
train['Date'].astype('int64') test['Date'] = test['Date'].astype('int64')",No,5,16.0 "from sklearn.preprocessing import LabelEncoder le = LabelEncoder() def FunLabelEncoder(df): for c in df.columns: if df.dtypes[c] == object: le.fit(df[c].astype(str)) df[c] = le.transform(df[c].astype(str)) return df",No,5,20.0 "train.iloc[:,:-2].sample(3)",No,5,41.0 "X = train.iloc[:,:-2] print(X.shape) X.sample(3)",No,3,41.0 "Y = train.iloc[:,-2:] print(Y.shape) Y.sample(3)",No,3,41.0 "from sklearn.model_selection import train_test_split trainX , valX, trainY, valY = train_test_split(X, Y, random_state=1)",No,5,13.0 "y1Train = trainY.iloc[:,0] print(y1Train.shape) y1Train.sample(3)",No,3,41.0 "y2Train = trainY.iloc[:,1] y2Train.sample(3)",No,5,41.0 "y1Val = valY.iloc[:,0] y1Val.sample(3)",No,5,41.0 "y2Val = valY.iloc[:,1] y2Val.sample(3)",No,4,41.0 "print(trainX.shape) trainX.sample(3)",No,4,41.0 print(trainX.info()),No,5,40.0 "trainX.iloc[:,1:].sample(3)",No,5,41.0 "from sklearn.tree import DecisionTreeRegressor lrModel1 = DecisionTreeRegressor(random_state = 27) %time lrModel1.fit(trainX.iloc[:,1:], y1Train)",No,3,4.0 "%time y1Pred = lrModel1.predict(valX.iloc[:,1:]) print(y1Pred[:,])",No,4,27.0 "from sklearn.metrics import mean_absolute_error print(""Accuracy in train set : "", lrModel1.score(trainX.iloc[:,1:], y1Train)) print(""RMSE : "", mean_absolute_error(y1Val, y1Pred)**(0.5))",No,4,28.0 "print(test.shape) test.sample(3)",No,3,41.0 "test.iloc[:,1:].sample(3)",No,5,41.0 "%time finalPred1 = lrModel1.predict(test.iloc[:,1:]) print(finalPred1[:,])",No,5,48.0 "%time finalPred2 = lrModel2.predict(test.iloc[:,1:]) print(finalPred2[:,])",No,5,48.0 "outputFile = pd.DataFrame({""ForecastId"": test.ForecastId, ""ConfirmedCases"": (finalPred1+0.5).astype('int'), ""Fatalities"": (finalPred2+0.5).astype('int')})'",No,5,12.0 outputFile.sample(3),No,5,41.0 "outputFile.to_csv(""submission.csv"", index=False)",No,5,25.0 "import plotly.express as px import plotly.graph_objs as go from plotly.subplots import make_subplots import plotly plotly.offline.init_notebook_mode() # For not show up chart error import matplotlib.pyplot as plt import matplotlib.animation as animation from IPython.display import HTML %matplotlib inline from tqdm import tqdm def RMSLE(pred,actual): return np.sqrt(np.mean(np.power((np.log(pred+1)-np.log(actual+1)),2)))",No,4,22.0 "# Fix error in train data FirstDate = train.groupby('Country_Region').min()['Date'].unique()[0] train['Last Confirm'] = train['ConfirmedCases'].shift(1) while train[(train['Last Confirm'] > train['ConfirmedCases']) & (train['Date'] > FirstDate)].shape[0] > 0: train['Last Confirm'] = train['ConfirmedCases'].shift(1) train['Last Fatalities'] = train['Fatalities'].shift(1) train.loc[(train['Last Confirm'] > train['ConfirmedCases']) & (train['Date'] > FirstDate),'ConfirmedCases'] = train.loc[(train['Last Confirm'] > train['ConfirmedCases']) & (train['Date'] > FirstDate),'Last Confirm'] train.loc[(train['Last Fatalities'] > train['Fatalities']) & (train['Date'] > FirstDate),'Fatalities'] = train.loc[(train['Last Fatalities'] > train['Fatalities']) & (train['Date'] > FirstDate),'Last Fatalities'] train['Last Confirm'] = train['ConfirmedCases'].shift(1) train['Last Fatalities'] = train['Fatalities'].shift(1)",No,5,8.0 "RMSLE(df_val[(df_val['ConfirmedCases'].isnull() == False)]['ConfirmedCases'].values,df_val[(df_val['ConfirmedCases'].isnull() == False)]['ConfirmedCases_hat'].values)",No,5,49.0 "RMSLE(df_val[(df_val['Fatalities'].isnull() == 
False)]['Fatalities'].values,df_val[(df_val['Fatalities'].isnull() == False)]['Fatalities_hat'].values)",No,5,49.0 "val_score = [] for country in df_val['Country_Region'].unique(): df_val_country = df_val[(df_val['Country_Region'] == country) & (df_val['Fatalities'].isnull() == False)] val_score.append([country, RMSLE(df_val_country['ConfirmedCases'].values,df_val_country['ConfirmedCases_hat'].values),RMSLE(df_val_country['Fatalities'].values,df_val_country['Fatalities_hat'].values)]) df_val_score = pd.DataFrame(val_score) df_val_score.columns = ['Country','ConfirmedCases_Scored','Fatalities_Scored'] df_val_score.sort_values('ConfirmedCases_Scored', ascending = False)",No,3,49.0 "country = ""Vietnam"" df_val = df_val_1 df_val[df_val['Country_Region'] == country].groupby(['Date','Country_Region']).sum().reset_index()'",No,2,41.0 "country = ""Vietnam"" df_val = df_val_1 df_country = df_val[df_val['Country_Region'] == country].groupby(['Date','Country_Region']).sum().reset_index() df_train = train[(train['Country_Region'].isin(df_country['Country_Region'].unique())) & (train['ConfirmedCases'] > 0)].groupby(['Date']).sum().reset_index() idx = df_country[((df_country['ConfirmedCases'].isnull() == False) & (df_country['ConfirmedCases'] > 0))].shape[0] fig = px.line(df_country, x=""Date"", y=""ConfirmedCases_hat"", title='Forecast Total Cases of ' + df_country['Country_Region'].values[0]) fig.add_scatter(x=df_train['Date'], y=df_train['ConfirmedCases'], mode='lines', name=""Actual train"", showlegend=True) fig.add_scatter(x=df_country['Date'][0:idx], y=df_country['ConfirmedCases'][0:idx], mode='lines', name=""Actual test"", showlegend=True) fig.show() fig = px.line(df_country, x=""Date"", y=""Fatalities_hat"", title='Forecast Total Fatalities of ' + df_country['Country_Region'].values[0]) fig.add_scatter(x=df_train['Date'], y=df_train['Fatalities'], mode='lines', name=""Actual train"", showlegend=True) fig.add_scatter(x=df_country['Date'][0:idx], y=df_country['Fatalities'][0:idx], mode='lines', name=""Actual test"", showlegend=True) fig.show()'",No,3,33.0 "df_total = df_val.groupby(['Date']).sum().reset_index() df_train = train[(train['Country_Region'].isin(df_val['Country_Region'].unique())) & (train['ConfirmedCases'] > 0)].groupby(['Date']).sum().reset_index() idx = df_total[((df_total['ConfirmedCases'].isnull() == False) & (df_total['ConfirmedCases'] > 0))].shape[0] fig = px.line(df_total, x=""Date"", y=""ConfirmedCases_hat"", title='Total Cases of World Forecast') fig.add_scatter(x=df_train['Date'], y=df_train['ConfirmedCases'], mode='lines', name=""Actual train"", showlegend=True) fig.add_scatter(x=df_total['Date'][0:idx], y=df_total['ConfirmedCases'][0:idx], mode='lines', name=""Actual test"", showlegend=True) fig.show() fig = px.line(df_total, x=""Date"", y=""Fatalities_hat"", title='Total Fatalities of World Forecast') fig.add_scatter(x=df_train['Date'], y=df_train['Fatalities'], mode='lines', name=""Actual train"", showlegend=True) fig.add_scatter(x=df_total['Date'][0:idx], y=df_total['Fatalities'][0:idx], mode='lines', name=""Actual test"", showlegend=True) fig.show()'",No,4,33.0 "import requests from bs4 import BeautifulSoup req = requests.get('https://www.worldometers.info/coronavirus/') soup = BeautifulSoup(req.text, ""lxml"") df_country = soup.find('div',attrs={""id"" : ""nav-tabContent""}).find('table',attrs={""id"" : ""main_table_countries_today""}).find_all('tr') arrCountry = [] for i in range(1,len(df_country)-1): tmp = df_country[i].find_all('td') if 
(tmp[0].string.find('=(first_predict_date-pd.DateOffset(days=n_in)))& (region_dfs[key]['Date']'2020-03-18'] model_check = pred_df[pred_df['Date']<=test_check['Date'].max()]",No,4,53.0 "np.sqrt(mean_squared_log_error(y_true = test_check[['ConfirmedCases','Fatalities']], y_pred = model_check[['ConfirmedCases','Fatalities']]))",No,5,49.0 "sub = pred_df[['ConfirmedCases','Fatalities']] sub['ForecastId'] = test_data['ForecastId']",No,5,55.0 sub.sample(20),No,5,41.0 "sub.to_csv(""submission.csv"",index=False)",No,5,25.0 "import pandas as pd import numpy as np import matplotlib.pyplot as plt %matplotlib inline import seaborn as sns import warnings warnings.filterwarnings(""ignore"")",No,3,22.0 "train = pd.read_csv('../input/covid19-global-forecasting-week-2/train.csv') train.info()",No,4,45.0 train.head(),No,5,41.0 train.tail(),No,5,41.0 train.describe(),No,5,40.0 train.corr(),No,5,40.0 train.isnull().sum(),No,5,39.0 "test = pd.read_csv('../input/covid19-global-forecasting-week-2/test.csv') test.info()",No,3,45.0 test.describe(),No,5,40.0 test.isnull().sum(),No,5,39.0 "#Changing dtype for dates from object to datetime train['Date'] = pd.to_datetime(train['Date']) test['Date'] = pd.to_datetime(test['Date'])",No,5,16.0 train[train['Province_State'].isnull()]['Country_Region'].unique(),No,3,57.0 train[train['Province_State'].notnull()]['Country_Region'].unique(),No,4,57.0 "train['Province_State'] = np.where(train['Province_State'].isnull(), train['Country_Region'], train['Province_State']) test['Province_State'] = np.where(test['Province_State'].isnull(), test['Country_Region'], test['Province_State'])",No,5,17.0 train[train['Province_State'] == 'Diamond Princess'],No,4,41.0 train[train['Province_State'] == 'Diamond Princess']['Country_Region'].unique(),No,4,57.0 df = train.append(test),No,5,11.0 "group = df.groupby(['Province_State', 'Country_Region'])['Date'].count().reset_index() group",No,5,60.0 df[df['Province_State'] == 'Georgia']['Country_Region'].unique(),No,4,57.0 "#Distinguishing Province/State Georgia according to Country/Region df['Province_State'] = np.where((df['Country_Region'] == 'Georgia') & (df['Province_State'] == 'Georgia'), 'Country Georgia', df['Province_State'])",No,5,8.0 "#Viewing the total number of confirmeed cases and fatalities worldwide world = train.groupby('Date')['ConfirmedCases', 'Fatalities'].sum().reset_index() plt.plot(world['Date'], world['ConfirmedCases'], label = 'Confirmed Cases') plt.plot(world['Date'], world['Fatalities'], label = 'Fatalities') plt.legend() plt.title('Total number of Confirmed Cases and Fatalities Worldwide') plt.xticks(rotation = 30) plt.show();",No,5,75.0 "#Plotting the number of confirmed cases and fatalities for each country country = train.groupby('Country_Region')['ConfirmedCases', 'Fatalities'].sum().reset_index() fig = plt.figure(figsize = (15, 25)) ax = fig.add_subplot(111) ax.barh(country['Country_Region'], country['ConfirmedCases'],label = 'Confirmed Cases') ax.barh(country['Country_Region'], country['Fatalities'],label = 'Fatalities') ax.legend() ax.set_title('Total Confirmed Cases and Fatalities by Country');",No,4,33.0 "#Viewing the top 15 countries with the most confirmed cases ranked = country.sort_values(by = 'ConfirmedCases', ascending = False)[:15] ranked",No,5,9.0 "#Plotting confirmed cases and fatalities for the 15 countries with the most cases countries = ['China', 'Italy', 'US', 'Spain', 'Germany', 'Iran', 'France', 'Korea, South', 'United Kingdom', 'Switzerland', 'Netherlands', 'Belgium', 'Austria', 'Turkey', 
'Canada'] for c in countries: group = train[train['Country_Region'] == c].groupby('Date')['ConfirmedCases', 'Fatalities'].sum().reset_index() group['ConfirmedCases'].plot(label = 'Confirmed Cases') group['Fatalities'].plot(label = 'Fatalities') plt.legend() plt.title(c) plt.show();",No,4,33.0 "from statsmodels.tsa.seasonal import seasonal_decompose def trends(country, case): group = train[train['Country_Region'] == country].groupby('Date')['ConfirmedCases', 'Fatalities'].sum().reset_index() decomposition = seasonal_decompose(group[case], freq = 3) trend = decomposition.trend seasonal = decomposition.seasonal residual = decomposition.resid plt.subplot(411) plt.plot(group[case], label= case) plt.legend(loc='best') plt.title('Original') plt.subplot(412) plt.plot(trend, label=case) plt.legend(loc='best') plt.title('Trend') plt.subplot(413) plt.plot(seasonal,label=case) plt.legend(loc='best') plt.title('Seasonality') plt.subplot(414) plt.plot(residual, label=case) plt.legend(loc='best') plt.title('Residual') plt.tight_layout();",No,3,33.0 "from statsmodels.tsa.stattools import adfuller def stationarity_test(country, case): timeseries = train[train['Country_Region'] == country].groupby('Date')['ConfirmedCases', 'Fatalities'].sum().reset_index() #Perform Dickey-Fuller test: print('Results of Dickey-Fuller Test:') dftest = adfuller(timeseries[case], autolag='AIC') dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value', '#Lags Used', 'Number of Observations Used']) for key,value in dftest[4].items(): dfoutput['Critical Value (%s)'%key] = value print (dfoutput)",No,5,47.0 "stationarity_test('US', 'ConfirmedCases')",No,5,47.0 "from statsmodels.tsa.arima_model import ARIMA from sklearn.metrics import mean_squared_error",No,5,22.0 "def comb_p_d_q(pVals,dVals,qVals): return [(p,d,q) for p in pVals for d in dVals for q in qVals]",No,5,53.0 "#List of combinations for pdq pdq_results = comb_p_d_q([0,1,2],[0,1,2],[0,1,2]) pdq_results",No,5,53.0 "df.drop_duplicates(subset = ['Date', 'Province_State'], keep = 'last', inplace = True)",No,5,19.0 "from datetime import timedelta TEST_DAYS = 7 TRAIN_LAST = - timedelta(days=TEST_DAYS) TEST_FIRST = sub_df[""Date""].min() TEST_DAYS = (df[""Date""].max() - TEST_FIRST).days + 1 dev_df, test_df = df[df[""Date""] < TEST_FIRST].copy(), df[df[""Date""] >= TEST_FIRST].copy() dev_df.shape, test_df.shape",No,3,14.0 "from sklearn.linear_model import LinearRegression from sklearn.metrics import mean_squared_error from sklearn.preprocessing import PolynomialFeatures from sklearn.pipeline import Pipeline model = Pipeline([('poly', PolynomialFeatures(degree=2, include_bias=False)), ('linear', LinearRegression())]) features = [""prev_{}"".format(col) for col in TARGETS] model.fit(dev_df[features], dev_df[TARGETS]) [mean_squared_error(dev_df[TARGETS[i]], model.predict(dev_df[features])[:, i]) for i in range(len(TARGETS))]'",No,3,22.0 "def rmse(y_true, y_pred): return np.sqrt(mean_squared_error(y_true, y_pred)) def evaluate(df): error = 0 for col in TARGETS: error += rmse(df[col].values, df[""pred_{}"".format(col)].values) return np.round(error/len(TARGETS), 5) def predict(test_df, first_day, num_days, val=False): y_pred = np.clip(model.predict(test_df.loc[test_df[""Date""] == first_day][features]), None, 16) for i, col in enumerate(TARGETS): test_df[""pred_{}"".format(col)] = 0 test_df.loc[test_df[""Date""] == first_day, ""pred_{}"".format(col)] = y_pred[:, i] if val: print(first_day, evaluate(test_df[test_df[""Date""] == first_day])) for d in range(1, num_days): y_pred 
= np.clip(model.predict(y_pred), None, 16) date = first_day + timedelta(days=d) for i, col in enumerate(TARGETS): test_df.loc[test_df[""Date""] == date, ""pred_{}"".format(col)] = y_pred[:, i] if val: print(date, evaluate(test_df[test_df[""Date""] == date])) return test_df test_df = predict(test_df, TEST_FIRST, TEST_DAYS, val=True) evaluate(test_df)",No,2,27.0 "for col in TARGETS: test_df[col] = np.expm1(test_df[col]) test_df[""pred_{}"".format(col)] = np.expm1(test_df[""pred_{}"".format(col)])",No,5,8.0 "SUB_FIRST = sub_df[""Date""].min() SUB_DAYS = (sub_df[""Date""].max() - sub_df[""Date""].min()).days + 1 sub_df = dev_df.append(sub_df, sort=False) for col in TARGETS: sub_df[""prev_{}"".format(col)] = sub_df.groupby(loc_group)[col].shift() sub_df = sub_df[sub_df[""Date""] >= SUB_FIRST].copy() sub_df[""ForecastId""] = sub_df[""ForecastId""].astype(np.int16) sub_df = predict(sub_df, SUB_FIRST, SUB_DAYS) for col in TARGETS: sub_df[col] = np.expm1(sub_df[""pred_{}"".format(col)]) sub_df.head()",No,4,8.0 "sub_df.to_csv(""submission.csv"", index=False, columns=[""ForecastId""] + TARGETS)",No,5,25.0 "submission = pd.read_csv(""../input/covid19-global-forecasting-week-2/submission.csv"") submission",No,5,45.0 "submission['ConfirmedCases'] = sub_df['ConfirmedCases'] submission['Fatalities'] = sub_df['Fatalities'] submission.to_csv('submission.csv', index=False) submission",No,5,25.0 "import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) import os import matplotlib.pyplot as plt",No,5,22.0 "from sklearn.linear_model import ( ElasticNet, ElasticNetCV, Lasso, LassoCV, LinearRegression, LogisticRegression, Ridge, ) from sklearn.ensemble import ( AdaBoostClassifier, GradientBoostingClassifier, GradientBoostingRegressor, RandomForestClassifier, RandomForestRegressor, VotingClassifier, ) from sklearn.model_selection import ( GridSearchCV, KFold, RandomizedSearchCV, cross_val_score, train_test_split, )",No,5,22.0 "import seaborn as sns from sklearn.base import BaseEstimator from xgboost import XGBClassifier, XGBRegressor import hyperopt as hp from hyperopt import STATUS_OK, Trials, fmin, hp, tpe",No,5,22.0 "#Import Date xtrain = pd.read_csv(""/kaggle/input/covid19-global-forecasting-week-2/train.csv"") xtest = pd.read_csv(""/kaggle/input/covid19-global-forecasting-week-2/test.csv"") xsubmission = pd.read_csv(""/kaggle/input/covid19-global-forecasting-week-2/submission.csv"") # view shape of test and train data print(xtrain.shape) print(xtest.shape)",No,4,45.0 "# view head of train data xtrain.head(10)",No,5,41.0 "# view head of test data xtest.head(10)",No,5,41.0 "# view head of submission data xsubmission.head(10)",No,5,41.0 "# date wise value count xtrain['Date'].value_counts()",No,4,54.0 "# create ConfirmedCasesgroup by Province_State statevalue = xtrain.groupby('Province_State').max().ConfirmedCases",No,5,60.0 "# view top state conformed cases in a barplot top_states = statevalue.sort_values(ascending = False).head(10) sns.barplot(x=top_states.index, y=top_states.values) plt.xticks(rotation = 'vertical')",No,5,33.0 "# make data as integer xtrain.ConfirmedCases = xtrain.ConfirmedCases.astype('int64') xtrain.Fatalities = xtrain.Fatalities.astype('int64')",No,5,16.0 "# Date wise confirm case view in an lineplot plt.figure(figsize=(15,6)) sns.lineplot(x=xtrain.Date,y=xtrain.ConfirmedCases,markers=True,style=True) plt.xticks(rotation = 'vertical')",No,5,75.0 "# Date wise Fatalities view in an lineplot plt.figure(figsize=(15,6)) 
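# note added: xtrain holds one row per region per date, and seaborn's lineplot
# aggregates repeated x values with the mean and a confidence band by default,
# so this curve (like the ConfirmedCases lineplot above) shows the per-date average
# across regions rather than a worldwide total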
sns.lineplot(x=xtrain.Date,y=xtrain.Fatalities,markers=True,style=True) plt.xticks(rotation = 'vertical')",No,5,33.0 "# ConfirmedCases and Fatalities column groupby Country Region df_xtrain = xtrain.groupby(['Country_Region'])[['ConfirmedCases', 'Fatalities']].max() print(df_xtrain.sort_values(by=['ConfirmedCases','Fatalities'],ascending=False).head(10))",No,4,60.0 "# view countrywise ConfirmedCases and Fatalities in a plot fig,ax = plt.subplots() fig.set_figheight(10) fig.set_figwidth(40) ax.plot(df_xtrain[:29].index.values,df_xtrain[:29].ConfirmedCases, color=""red"", marker=""o"") ax.set_xlabel(""Countries"",fontsize=24) ax.set_ylabel(""Confirmed Cases"",color=""red"",fontsize=24) ax.tick_params(axis = 'both', which = 'major', labelsize = 24,labelrotation=90) ax2=ax.twinx() ax2.plot(df_xtrain[:29].index.values,df_xtrain[:29].Fatalities,color=""blue"",marker=""o"") ax2.set_ylabel(""Fatalities"",color=""blue"",fontsize=24) ax2.tick_params(axis = 'both', which = 'major', labelsize = 24) plt.show()'",No,5,33.0 "# ConfirmedCases and Fatalities data Analysis Exclude China and view in two Plot confirmed_total_date_noChina = xtrain[xtrain['Country_Region']!='China'].groupby(['Date']).agg({'ConfirmedCases':['sum']}) fatalities_total_date_noChina = xtrain[xtrain['Country_Region']!='China'].groupby(['Date']).agg({'Fatalities':['sum']}) total_date_noChina = confirmed_total_date_noChina.join(fatalities_total_date_noChina) fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15,5)) total_date_noChina.plot(ax=ax1) ax1.set_title(""Global confirmed cases excluding China"", size=13) ax1.set_ylabel(""Number of cases"", size=13) ax1.set_xlabel(""Date"", size=13) fatalities_total_date_noChina.plot(ax=ax2, color='orange') ax2.set_title(""Global deceased cases excluding China"", size=13) ax2.set_ylabel(""Number of cases"", size=13) ax2.set_xlabel(""Date"", size=13)'",No,5,75.0 "#ConfirmedCases and Fatalities data Analysis and Visualization for China confirmed_total_date_China = xtrain[xtrain['Country_Region']=='China'].groupby(['Date']).agg({'ConfirmedCases':['sum']}) fatalities_total_date_China = xtrain[xtrain['Country_Region']=='China'].groupby(['Date']).agg({'Fatalities':['sum']}) total_date_China = confirmed_total_date_China.join(fatalities_total_date_China) fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15,5)) total_date_China.plot(ax=ax1) ax1.set_title(""China confirmed cases"", size=13) ax1.set_ylabel(""Number of cases"", size=13) ax1.set_xlabel(""Date"", size=13) fatalities_total_date_China.plot(ax=ax2, color='orange') ax2.set_title(""China Fatalities cases"", size=13) ax2.set_ylabel(""Number of cases"", size=13) ax2.set_xlabel(""Date"", size=13)'",No,3,33.0 "#For Itally confirmed_total_date_Italy = xtrain[xtrain['Country_Region']=='Italy'].groupby(['Date']).agg({'ConfirmedCases':['sum']}) fatalities_total_date_Italy = xtrain[xtrain['Country_Region']=='Italy'].groupby(['Date']).agg({'Fatalities':['sum']}) total_date_Italy = confirmed_total_date_Italy.join(fatalities_total_date_Italy) #For Spain confirmed_total_date_Spain = xtrain[xtrain['Country_Region']=='Spain'].groupby(['Date']).agg({'ConfirmedCases':['sum']}) fatalities_total_date_Spain = xtrain[xtrain['Country_Region']=='Spain'].groupby(['Date']).agg({'Fatalities':['sum']}) total_date_Spain = confirmed_total_date_Spain.join(fatalities_total_date_Spain) plt.figure(figsize=(15,10)) plt.subplot(2, 2, 1) total_date_Italy.plot(ax=plt.gca(), title='Italy') plt.ylabel(""Confirmed infection cases"", size=13) plt.subplot(2, 2, 2) 
total_date_Spain.plot(ax=plt.gca(), title='Spain') plt.ylabel(""Confirmed infection cases"", size=13)'",No,3,33.0 "#For UK confirmed_total_date_UK = xtrain[xtrain['Country_Region']=='United Kingdom'].groupby(['Date']).agg({'ConfirmedCases':['sum']}) fatalities_total_date_UK = xtrain[xtrain['Country_Region']=='United Kingdom'].groupby(['Date']).agg({'Fatalities':['sum']}) total_date_UK = confirmed_total_date_UK.join(fatalities_total_date_UK) #For Singapore confirmed_total_date_Singapore = xtrain[xtrain['Country_Region']=='Singapore'].groupby(['Date']).agg({'ConfirmedCases':['sum']}) fatalities_total_date_Singapore = xtrain[xtrain['Country_Region']=='Singapore'].groupby(['Date']).agg({'Fatalities':['sum']}) total_date_Singapore = confirmed_total_date_Singapore.join(fatalities_total_date_Singapore) plt.figure(figsize=(15,10)) plt.subplot(2, 2, 1) total_date_UK.plot(ax=plt.gca(), title='United Kingdom') plt.ylabel(""Confirmed infection cases"", size=13) plt.subplot(2, 2, 2) total_date_Singapore.plot(ax=plt.gca(), title='Singapore') plt.ylabel(""Confirmed infection cases"", size=13)'",No,5,33.0 "#For Australia confirmed_total_date_Australia = xtrain[xtrain['Country_Region']=='Australia'].groupby(['Date']).agg({'ConfirmedCases':['sum']}) fatalities_total_date_Australia = xtrain[xtrain['Country_Region']=='Australia'].groupby(['Date']).agg({'Fatalities':['sum']}) total_date_Australia = confirmed_total_date_Australia.join(fatalities_total_date_Australia) #For Bangladesh confirmed_total_date_Bangladesh = xtrain[xtrain['Country_Region']=='Bangladesh'].groupby(['Date']).agg({'ConfirmedCases':['sum']}) fatalities_total_date_Bangladesh = xtrain[xtrain['Country_Region']=='Bangladesh'].groupby(['Date']).agg({'Fatalities':['sum']}) total_date_Bangladesh = confirmed_total_date_Bangladesh.join(fatalities_total_date_Bangladesh) plt.figure(figsize=(15,10)) plt.subplot(2, 2, 1) total_date_Australia.plot(ax=plt.gca(), title='Australia') plt.ylabel(""Confirmed infection cases"", size=13) plt.subplot(2, 2, 2) total_date_Bangladesh.plot(ax=plt.gca(), title='Bangladesh') plt.ylabel(""Confirmed infection cases"", size=13)'",No,4,33.0 "pop_italy = 60486683. pop_spain = 46749696. pop_UK = 67784927. pop_singapore = 5837230. total_date_Italy.ConfirmedCases = total_date_Italy.ConfirmedCases/pop_italy*100. total_date_Italy.Fatalities = total_date_Italy.ConfirmedCases/pop_italy*100. total_date_Spain.ConfirmedCases = total_date_Spain.ConfirmedCases/pop_spain*100. total_date_Spain.Fatalities = total_date_Spain.ConfirmedCases/pop_spain*100. total_date_UK.ConfirmedCases = total_date_UK.ConfirmedCases/pop_UK*100. total_date_UK.Fatalities = total_date_UK.ConfirmedCases/pop_UK*100. total_date_Singapore.ConfirmedCases = total_date_Singapore.ConfirmedCases/pop_singapore*100. total_date_Singapore.Fatalities = total_date_Singapore.ConfirmedCases/pop_singapore*100. 
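# note added (hedged aside): the four Fatalities assignments above reuse the ConfirmedCases
# column, which has already been overwritten by its population share, so they hold neither
# raw fatality counts nor a meaningful fraction. Assuming the intent was the fatality share
# of each population, the pattern would be, e.g.
# total_date_Italy.Fatalities = total_date_Italy.Fatalities/pop_italy*100.
# computed before ConfirmedCases is normalised; the subplots below only draw ConfirmedCases,
# so the figures shown are unaffected.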
plt.figure(figsize=(15,10)) plt.subplot(2, 2, 1) total_date_Italy.ConfirmedCases.plot(ax=plt.gca(), title='Italy') plt.ylabel(""Fraction of population infected"") plt.ylim(0, 0.06) plt.subplot(2, 2, 2) total_date_Spain.ConfirmedCases.plot(ax=plt.gca(), title='Spain') plt.ylim(0, 0.06) plt.subplot(2, 2, 3) total_date_UK.ConfirmedCases.plot(ax=plt.gca(), title='United Kingdom') plt.ylabel(""Fraction of population infected"") plt.ylim(0, 0.005) plt.subplot(2, 2, 4) total_date_Singapore.ConfirmedCases.plot(ax=plt.gca(), title='Singapore') plt.ylim(0, 0.005)'",No,5,33.0 "# For Itally confirmed_total_date_Italy = xtrain[(xtrain['Country_Region']=='Italy') & xtrain['ConfirmedCases']!=0].groupby(['Date']).agg({'ConfirmedCases':['sum']}) fatalities_total_date_Italy = xtrain[(xtrain['Country_Region']=='Italy') & xtrain['ConfirmedCases']!=0].groupby(['Date']).agg({'Fatalities':['sum']}) total_date_Italy = confirmed_total_date_Italy.join(fatalities_total_date_Italy) # For Spain confirmed_total_date_Spain = xtrain[(xtrain['Country_Region']=='Spain') & (xtrain['ConfirmedCases']!=0)].groupby(['Date']).agg({'ConfirmedCases':['sum']}) fatalities_total_date_Spain = xtrain[(xtrain['Country_Region']=='Spain') & (xtrain['ConfirmedCases']!=0)].groupby(['Date']).agg({'Fatalities':['sum']}) total_date_Spain = confirmed_total_date_Spain.join(fatalities_total_date_Spain) # For UK confirmed_total_date_UK = xtrain[(xtrain['Country_Region']=='United Kingdom') & (xtrain['ConfirmedCases']!=0)].groupby(['Date']).agg({'ConfirmedCases':['sum']}) fatalities_total_date_UK = xtrain[(xtrain['Country_Region']=='United Kingdom') & (xtrain['ConfirmedCases']!=0)].groupby(['Date']).agg({'Fatalities':['sum']}) total_date_UK = confirmed_total_date_UK.join(fatalities_total_date_UK) # For Australia confirmed_total_date_Australia = xtrain[(xtrain['Country_Region']=='Australia') & (xtrain['ConfirmedCases']!=0)].groupby(['Date']).agg({'ConfirmedCases':['sum']}) fatalities_total_date_Australia = xtrain[(xtrain['Country_Region']=='Australia') & (xtrain['ConfirmedCases']!=0)].groupby(['Date']).agg({'Fatalities':['sum']}) total_date_Australia = confirmed_total_date_Australia.join(fatalities_total_date_Australia) # For Singapore confirmed_total_date_Singapore = xtrain[(xtrain['Country_Region']=='Singapore') & (xtrain['ConfirmedCases']!=0)].groupby(['Date']).agg({'ConfirmedCases':['sum']}) fatalities_total_date_Singapore = xtrain[(xtrain['Country_Region']=='Singapore') & (xtrain['ConfirmedCases']!=0)].groupby(['Date']).agg({'Fatalities':['sum']}) total_date_Singapore = confirmed_total_date_Singapore.join(fatalities_total_date_Singapore) italy = [i for i in total_date_Italy.ConfirmedCases['sum'].values] italy_30 = italy[0:50] spain = [i for i in total_date_Spain.ConfirmedCases['sum'].values] spain_30 = spain[0:50] UK = [i for i in total_date_UK.ConfirmedCases['sum'].values] UK_30 = UK[0:50] singapore = [i for i in total_date_Singapore.ConfirmedCases['sum'].values] singapore_30 = singapore[0:50] ",No,4,12.0 "# Plots plt.figure(figsize=(12,6)) plt.plot(italy_30) plt.plot(spain_30) plt.plot(UK_30) plt.plot(singapore_30) plt.legend([""Italy"", ""Spain"", ""UK"", ""Singapore""], loc='upper left') plt.title(""COVID-19 infections from the first confirmed case"", size=15) plt.xlabel(""Days"", size=13) plt.ylabel(""Infected cases"", size=13) plt.ylim(0, 60000) plt.show()'",No,5,33.0 "# Check if there have any null value xtrain.isnull().sum()",No,5,39.0 "# CHANGE TO PD.DATETIME xtrain.Date = pd.to_datetime(xtrain.Date, infer_datetime_format=True) 
xtest.Date = pd.to_datetime(xtest.Date, infer_datetime_format=True)",No,5,16.0 "# CONCISING THE TRAIN DATASET TO 18TH MARCH 2020. MIN_TEST_DATE = xtest.Date.min() xtrain = xtrain.loc[xtrain.Date < MIN_TEST_DATE, :]",No,4,14.0 "# FILLING MISSING VALUES xtrain.fillna("""", inplace=True) xtest.fillna("""", inplace=True)",No,5,17.0 from statsmodels.tsa.arima_model import ARIMA,No,5,22.0 "# DROPPING COUNTRY REGION AND PROVINCE STATE xtrain.drop(['Country_Region','Province_State'],axis=1,inplace=True) xtest.drop(['Country_Region','Province_State'],axis=1,inplace=True)",No,5,10.0 "# CONVERTING DATE COLUMN TO INTEGER xtrain.loc[:, 'Date'] = xtrain.Date.dt.strftime(""%m%d"") xtest.loc[:, 'Date'] = xtest.Date.dt.strftime(""%m%d"")'",No,5,16.0 "# Region wise Confirmed Cases in LinePlot sns.lineplot(data=xtrain, x=""Date"", y=""ConfirmedCases"", hue=""Region"") plt.show()",No,5,75.0 "# Region wise Fatalities in Line Plot. sns.lineplot(data=xtrain, x=""Date"", y=""Fatalities"", hue=""Region"") plt.show()",No,5,33.0 "# CREATING X AND Y for Train Dataset X1 = xtrain.drop([""ConfirmedCases"", ""Fatalities""], axis=1) X2 = xtrain.drop([""ConfirmedCases"", ""Fatalities""], axis=1) y1 = xtrain[""ConfirmedCases""] y2 = xtrain[""Fatalities""]",No,5,21.0 "# Create TEST 1 AND TEST 2 for Test dataset test_1 = xtest.copy() test_2 = xtest.copy()",No,5,12.0 "for f2 in [""Region""]: me2 = MeanEncoding(f2, C=0.01 * len(X2[f2].unique())) me2.fit(X2, y2) X2 = me2.transform(X2) test_2 = me2.transform(test_2)",No,4,7.0 "for f1 in [""Region""]: me1 = MeanEncoding(f1, C=0.01 * len(X1[f1].unique())) me1.fit(X1, y1) X1 = me1.transform(X1) test_1 = me1.transform(test_1)",No,5,20.0 "# View Test_1 test_1",No,5,53.0 "# View Test_2 test_2",No,5,41.0 "# Load some Basic Library import matplotlib.pyplot as plt from sklearn import model_selection import numpy as np",No,5,22.0 "# FUNCTION FOR COMPARING DIFFERENT REGRESSORS def algorithim_boxplot_comparison( X, y, algo_list=[], random_state=3, scoring=""r2"", n_splits=10 ): results = [] names = [] for algo_name, algo_model in algo_list: kfold = model_selection.KFold( shuffle=True, n_splits=n_splits, random_state=random_state ) cv_results = model_selection.cross_val_score( algo_model, X, y, cv=kfold, scoring=scoring ) results.append(cv_results) names.append(algo_name) msg = ""%s: %s : (%f) %s : (%f) %s : (%f)"" % ( algo_name, ""median"", np.median(cv_results), ""mean"", np.mean(cv_results), ""variance"", cv_results.var(ddof=1), ) print(msg) # boxplot algorithm comparison fig = plt.figure() fig.suptitle(""Algorithm Comparison"") ax = fig.add_subplot(111) plt.boxplot(results) ax.set_xticklabels(names) plt.show()",No,5,84.0 "# REGRESSORS lr = LinearRegression(n_jobs=-1) rfr = RandomForestRegressor(random_state=96, n_jobs=-1) gbr = GradientBoostingRegressor(random_state=96) xgbr = XGBRegressor()",No,5,82.0 "df_out = pd.DataFrame({'ForecastId': [], 'ConfirmedCases': [], 'Fatalities': []}) soln = pd.DataFrame({'ForecastId': test_df.ForecastId, 'ConfirmedCases': Y_pred_CC, 'Fatalities': Y_pred_Fat}) df_out = pd.concat([df_out, soln], axis=0) df_out.ForecastId = df_out.ForecastId.astype('int')",No,3,12.0 "# APPENDING THE REGRESSORS IN A LIST models = [] models.append(('lr',lr)) models.append(('rfr',rfr)) models.append(('gbr',gbr)) models.append(('xgbr',xgbr))",No,5,82.0 "df_out.to_csv('submission.csv', index=False) print(""Your submission was successfully saved!"")'",No,5,25.0 \,No,5,6.0 "def model_eval(case): state = ['Italy'] for s in state: train_ts = train[train['Province_State'] == s][:50] 
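# note added: the first 50 rows per state form the ARIMA fitting window and the remaining
# rows are held out for comparison; the loop below keeps the (p,d,q) order from pdq_results
# with the lowest AIC, then refits that order and forecasts over the hold-out horizon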
test_ts = train[train['Province_State'] == s][50:] a = 9999 for pdq in pdq_results: try: model = ARIMA(train_ts[case], order = pdq, dates = train_ts['Date'], freq = 'D') model_fit = model.fit() aicval = model_fit.aic if aicval < a: a = aicval param = pdq except: pass model = ARIMA(train_ts[case], order = param, dates = train_ts['Date'], freq = 'D') model_fit = model.fit() model_fit.plot_predict(start = int(len(train_ts) * 0.3), end = int(len(train_ts) * 1.4)) pred = model_fit.forecast(steps = int(len(test_ts)))[0] ",No,3,48.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load in import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) # Input data files are available in the ""../input/"" directory. # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory import os for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename)) # Any results you write to the current directory are saved as output. '",No,5,88.0 model_eval('ConfirmedCases'),No,5,53.0 "import numpy as np import pandas as pd import matplotlib.pyplot as plt from scipy.optimize import curve_fit import seaborn as sns from datetime import datetime",No,5,22.0 model_eval('Fatalities'),No,5,53.0 "def model(case): state = df['Province_State'].unique() confirmed = [] for s in state: train_ts = df[df['Province_State'] == s][:57] pred_ts = df[df['Province_State'] == s][57:] a = 9999 for pdq in pdq_results: try: model = ARIMA(train_ts[case], order = pdq, dates = train_ts['Date'], freq = 'D') model_fit = model.fit() aicval = model_fit.aic if aicval < a: a = aicval param = pdq except: pass try: model = ARIMA(train_ts[case], order = param, dates = train_ts['Date'], freq = 'D') model_fit = model.fit() pred = model_fit.forecast(steps = int(len(pred_ts)))[0] confirmed = np.append(confirmed, pred.tolist()) except: confirmed = np.append(confirmed, np.repeat(0, 43)) continue test[case] = confirmed",No,3,48.0 "print('Importing training and test data') train_df = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-2/train.csv') test_df = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-2/test.csv') # Update dataframe train_df['Province_State'] = train_df['Province_State'].fillna('') train_df['Region'] = train_df['Country_Region'] + train_df['Province_State'] test_df['Province_State'] = test_df['Province_State'].fillna('') test_df['Region'] = test_df['Country_Region'] + test_df['Province_State'] regions = train_df.Region.unique() # Match days in train and test train_min_date = train_df[train_df['Region']=='Sweden']['Date'].min() test_min_date = test_df[test_df['Region']=='Sweden']['Date'].min() dt_train_min = datetime.strptime(train_min_date, '%Y-%m-%d') dt_test_min = datetime.strptime(test_min_date, '%Y-%m-%d') test_start_day = dt_test_min.timetuple().tm_yday - dt_train_min.timetuple().tm_yday print(test_start_day) # Extract dataframes for each country train_data = {} test_data = {} for region in regions: train_data[region] = train_df[train_df['Region']==region] train_data[region]['DayNo'] = np.arange(len(train_df[train_df['Region']==region]['Date'])) test_data[region] = test_df[test_df['Region']==region] test_data[region]['DayNo'] = 
np.arange(test_start_day,test_start_day+len(test_df[test_df['Region']==region]['Date'])) ",No,4,8.0 model('ConfirmedCases'),No,3,48.0 model('Fatalities'),No,3,48.0 "results = test[['ForecastId', 'ConfirmedCases', 'Fatalities']] results.to_csv('submission.csv', index = False)",No,5,25.0 "train_max_date = train_df[train_df['Region']=='Sweden']['Date'].max() dt_train_max = datetime.strptime(train_max_date, '%Y-%m-%d') train_max_day = dt_train_max.timetuple().tm_yday - dt_train_min.timetuple().tm_yday #print(train_max_day) #int(train_data[""Sweden""][train_data[""Sweden""]['DayNo']==69]['ConfirmedCases'].tolist()[0]) #test_data[""Sweden""]['DayNo']'",No,5,77.0 "def sigmoid(x, a, b, c): return a*np.exp(c*(x-b))/(np.exp(c*(x-b))+1)",No,5,84.0 "import numpy as np import pandas as pd",No,4,22.0 "# HYPEROPT def auc_model(params): params = { ""n_estimators"": int(params[""n_estimators""]), ""max_features"": int(params[""max_features""]), ""min_samples_leaf"": int(params[""min_samples_leaf""]), ""min_samples_split"": int(params[""min_samples_split""]), } clf = RandomForestRegressor(**params, random_state=96, n_jobs=-1) return cross_val_score( clf, X1, y1, cv=3, scoring=""neg_mean_squared_log_error"" ).mean() params_space = { ""n_estimators"": hp.quniform(""n_estimators"", 0, 300, 50), ""max_features"": hp.quniform(""max_features"", 1, 3, 1), ""min_samples_leaf"": hp.quniform(""min_samples_leaf"", 1, 50, 1), ""min_samples_split"": hp.quniform(""min_samples_split"", 1, 50, 1), } best = 0 def f(params): global best auc = auc_model(params) if auc > best: print(""New Best"", best, params) return {""loss"": -auc, ""status"": STATUS_OK} trials = Trials() best = fmin(f, params_space, algo=tpe.suggest, max_evals=200, trials=trials) print(""best:\ "", best)'",No,4,1.0 "failed_confirmed = [] failed_fatalities = [] confirmed_popt = {} fatalities_popt = {} for region in regions: x_data = train_data[region]['DayNo'] y_ConfirmedCases_data = train_data[region]['ConfirmedCases'] y_Fatalities_data = train_data[region]['Fatalities'] # Fit data to function try: popt, pcov = curve_fit(sigmoid, x_data, y_ConfirmedCases_data) confirmed_popt[region] = popt except: failed_confirmed.append(region) try: popt, pcov = curve_fit(sigmoid, x_data, y_Fatalities_data) fatalities_popt[region] = popt except: failed_fatalities.append(region) print(""Failed confirmed: "" + str(len(failed_confirmed))) print(""Failed fatalities: "" + str(len(failed_fatalities))) print(""Total: "" + str(len(regions)))'",No,5,84.0 "# Handle failed data confirmed_coeffs = [x for x in confirmed_popt.values()] mean_confirmed_coeffs = np.mean(confirmed_coeffs, axis=0) print(mean_confirmed_coeffs) fatalities_coeffs = [x for x in fatalities_popt.values()] mean_fatalities_coeffs = np.mean(fatalities_coeffs, axis=0) print(mean_fatalities_coeffs) for region in failed_confirmed: x_data = train_data[region]['DayNo'] y_ConfirmedCases_data = train_data[region]['ConfirmedCases'] # Fit data to function try: popt, pcov = curve_fit(sigmoid, x_data, y_ConfirmedCases_data, maxfev=1000, ftol=1e-5) confirmed_popt[region] = popt except: start = 0 for data in y_ConfirmedCases_data: if data > 0: break start = start + 1 popt = mean_confirmed_coeffs popt[1] = start confirmed_popt[region] = popt print(""Failed for C "" + region + "" : "" + str(popt)) for region in failed_fatalities: x_data = train_data[region]['DayNo'] y_Fatalities_data = train_data[region]['Fatalities'] # Fit data to function try: popt, pcov = curve_fit(sigmoid, x_data, y_Fatalities_data, maxfev=1000, ftol=1e-5) 
fatalities_popt[region] = popt except: start = 0 for data in y_Fatalities_data: if data > 0: break start = start + 1 popt = mean_fatalities_coeffs popt[1] = start fatalities_popt[region] = popt print(""Failed F for "" + region + "" : "" + str(popt))'",No,5,53.0 "sub = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-2/submission.csv') test_regions = test_df.Region.unique() total_count = 0 for region in test_regions: forecastIds = test_data[region]['ForecastId'] x_test_data = test_data[region]['DayNo'] y_conf_test_data = np.nan_to_num(sigmoid(x_test_data, *confirmed_popt[region])).astype(np.int) x_test_data = test_data[region]['DayNo'] y_fatal_test_data = np.nan_to_num(sigmoid(x_test_data, *fatalities_popt[region])).astype(np.int) idx = 0 x_test_data = x_test_data.tolist() for id in forecastIds: day_no = x_test_data[idx] row_index = sub.index[sub['ForecastId'] == id] if day_no > train_max_day: sub.set_value(row_index, 'ConfirmedCases', y_conf_test_data[idx]) sub.set_value(row_index, 'Fatalities', y_fatal_test_data[idx]) else: sub.set_value(row_index, 'ConfirmedCases', int(train_data[region][train_data[region]['DayNo']==day_no]['ConfirmedCases'].tolist()[0])) sub.set_value(row_index, 'Fatalities', int(train_data[region][train_data[region]['DayNo']==day_no]['Fatalities'].tolist()[0])) idx = idx + 1 sub.to_csv('/kaggle/working/submission.csv', index=False) ",No,3,8.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load in import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) trainData = pd.read_csv('../input/train.csv') testData = pd.read_csv('../input/test.csv') trainData = trainData.drop('Id', axis=1) testData = testData.drop('Id', axis=1)",No,3,45.0 "trainData['Open Date'] = pd.to_datetime(trainData['Open Date'], format='%m/%d/%Y') testData['Open Date'] = pd.to_datetime(testData['Open Date'], format='%m/%d/%Y') trainData['OpenDays']="""" testData['OpenDays']="""" dateLastTrain = pd.DataFrame({'Date':np.repeat(['01/01/2015'],[len(trainData)]) }) dateLastTrain['Date'] = pd.to_datetime(dateLastTrain['Date'], format='%m/%d/%Y') dateLastTest = pd.DataFrame({'Date':np.repeat(['01/01/2015'],[len(testData)]) }) dateLastTest['Date'] = pd.to_datetime(dateLastTest['Date'], format='%m/%d/%Y') trainData['OpenDays'] = dateLastTrain['Date'] - trainData['Open Date'] testData['OpenDays'] = dateLastTest['Date'] - testData['Open Date'] trainData['OpenDays'] = trainData['OpenDays'].astype('timedelta64[D]').astype(int) testData['OpenDays'] = testData['OpenDays'].astype('timedelta64[D]').astype(int) trainData = trainData.drop('Open Date', axis=1) testData = testData.drop('Open Date', axis=1)'",No,4,16.0 "cityPerc = trainData[[""City Group"", ""revenue""]].groupby(['City Group'],as_index=False).mean() #sns.barplot(x='City Group', y='revenue', data=cityPerc) citygroupDummy = pd.get_dummies(trainData['City Group']) trainData = trainData.join(citygroupDummy) citygroupDummyTest = pd.get_dummies(testData['City Group']) testData = testData.join(citygroupDummyTest) trainData = trainData.drop('City Group', axis=1) testData = testData.drop('City Group', axis=1)'",No,4,10.0 "#Regression on everything from sklearn.ensemble import RandomForestRegressor import matplotlib.pyplot as plt import seaborn as sns sns.set_context(""notebook"", font_scale=1.1) sns.set_style(""ticks"") import numpy 
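# note added: OpenDays and revenue are modelled on a log scale here (presumably to tame
# their right-skew), so the forest below is fit on log targets and its predictions are
# mapped back with numpy.exp later in this same cell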
xTrain = pd.DataFrame({'OpenDays':trainData['OpenDays'].apply(numpy.log), 'Big Cities':trainData['Big Cities'], 'Other':trainData['Other'], 'P2':trainData['P2'], 'P8':trainData['P8'], 'P22':trainData['P22'], 'P24':trainData['P24'], 'P28':trainData['P28'], 'P26':trainData['P26']}) #xTrain = trainData.drop(['revenue'], axis=1) #xTrain['OpenDays'] = xTrain['OpenDays'].apply(numpy.log) yTrain = trainData['revenue'].apply(numpy.log) xTest = pd.DataFrame({'OpenDays':testData['OpenDays'].apply(numpy.log), 'Big Cities':testData['Big Cities'], 'Other':testData['Other'], 'P2':testData['P2'], 'P8':testData['P8'], 'P22':testData['P22'], 'P24':testData['P24'], 'P28':testData['P28'], 'P26':testData['P26']}) from sklearn import linear_model cls = RandomForestRegressor(n_estimators=150) cls.fit(xTrain, yTrain) pred = cls.predict(xTest) pred = numpy.exp(pred) cls.score(xTrain, yTrain)'",No,3,7.0 "pred = cls.predict(xTest) pred = numpy.exp(pred)",No,5,48.0 pred,No,5,53.0 "pred2 = [] for i in range(len(pred)): if pred[i] != float('Inf'): pred2.append(pred[i]) m = sum(pred2) / float(len(pred2)) for i in range(len(pred)): if pred[i] == float('Inf'): print(""haha"") pred[i] = m'",No,5,53.0 "# RANDOMFORESTREGRESSOR FOR CONFIRMEDCASUALTIES rfr1 = RandomForestRegressor( max_features=3, min_samples_leaf=26, min_samples_split=31, n_estimators=200, random_state=96, n_jobs=-1, )",No,5,4.0 "# RANDOMFORESTREGRESSOR FOR FATALITIES rfr2 = RandomForestRegressor( max_features=3, min_samples_leaf=17, min_samples_split=17, n_estimators=100, random_state=96, n_jobs=-1, )",No,5,4.0 "# FITTING RANDOMFORESTREGRESSOR FOR CONFIRMEDCASUALTIES rfr1.fit(X1, y1)",No,5,7.0 "# PREDICTING CONFIRMEDCASUALTIES using RANDOM FOREST REGRESSOR y_n_1 = rfr1.predict(test_1)",No,5,48.0 "# Fit CONFIRMEDCASUALTIES using K neareat neighbour algorithm Classifier from sklearn.neighbors import KNeighborsClassifier classifier = KNeighborsClassifier(n_neighbors = 4, metric = 'braycurtis', p = 1) classifier.fit(X1, y1)",No,4,7.0 "### Predict CONFIRMEDCASUALTIES using K neareat neighbour algorithm Classifier y_pred1 = classifier.predict(X1) # Making the Confusion Matrix from sklearn.metrics import confusion_matrix cm = confusion_matrix(y1, y_pred1) from sklearn.metrics import accuracy_score print( 'Accuracy Score confirmed cases :',accuracy_score(y1,y_pred1)*100)",No,4,27.0 "# FITTING RANDOMFORESTREGRESSOR FOR FATALITIES rfr2.fit(X2, y2)",No,5,7.0 "# PREDICTING FATALITIES y_n_2 = rfr2.predict(test_2)",No,5,48.0 "# ### Fit Fatalities using K neareat neighbour algorithm Classifier from sklearn.neighbors import KNeighborsClassifier classifier = KNeighborsClassifier(n_neighbors = 4, metric = 'braycurtis', p = 1) classifier.fit(X2, y2)",No,5,7.0 "### Predict Fatalities using K neareat neighbour algorithm Classifier y_pred2 = classifier.predict(X2) # Making the Confusion Matrix from sklearn.metrics import confusion_matrix cm = confusion_matrix(y2, y_pred2) from sklearn.metrics import accuracy_score print( 'Accuracy Score confirmed cases :',accuracy_score(y2,y_pred2)*100)",No,3,27.0 "# ADDING CONFIRMEDCASES xsubmission.ConfirmedCases = round(pd.DataFrame(y_n_1))",No,4,8.0 " # ADDING FATALITIES xsubmission.Fatalities = round(pd.DataFrame(y_n_2))",No,4,8.0 "# View submission data xsubmission",No,5,41.0 "# Save Date to submission file xsubmission.to_csv(""submission.csv"", index=False) print(""Submission file create sucessfully"")",No,5,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python docker 
image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load in import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) # Input data files are available in the ""../input/"" directory. # For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory import os print(os.listdir(""../input"")) # Any results you write to the current directory are saved as output.",No,5,88.0 "from numpy import loadtxt from xgboost import XGBClassifier from sklearn.model_selection import train_test_split from sklearn.metrics import accuracy_score",No,5,22.0 "Train_data = pd.read_csv(""../input/train.csv"") Test_data = pd.read_csv(""../input/test.csv"") ID = Test_data['Id']'",No,5,45.0 Train_data.head(),No,5,41.0 Test_data.head(),No,5,41.0 "train_levels = Train_data.loc[(Train_data['City'].notnull())] City_counts = train_levels['City'].value_counts().sort_index().to_frame() City_counts",No,5,72.0 "train_levels = Train_data.loc[(Train_data['Type'].notnull())] label_counts = train_levels['Type'].value_counts().sort_index().to_frame() label_counts",No,5,72.0 "# lie del Train_data[""Open Date""] del Train_data[""City""] del Train_data[""City Group""] del Train_data[""Type""] del Train_data[""Id""] del Test_data[""Open Date""] del Test_data[""City""] del Test_data[""City Group""] del Test_data[""Type""] del Test_data[""Id""]'",No,5,10.0 "# 0 Train_data = Train_data.fillna(0) Test_data = Test_data.fillna(0) Test_data.head(10)'",No,4,17.0 "#Regression on everything from sklearn.ensemble import RandomForestRegressor import seaborn as sns import numpy sns.set_context(""notebook"", font_scale=1.11) sns.set_style(""ticks"") yTrain = Train_data['revenue'].apply(numpy.log) Train_data = Train_data.drop([""revenue""],1) xTrain = pd.DataFrame(Train_data) xTest = pd.DataFrame(Test_data) '",No,3,22.0 "from sklearn import linear_model from sklearn.ensemble import RandomForestRegressor cls = RandomForestRegressor(n_estimators=170) cls.fit(xTrain, yTrain) pred = cls.predict(xTest) pred = numpy.exp(pred) closs = cls.score(xTrain, yTrain) closs",No,4,7.0 "pred = cls.predict(xTest) pred = numpy.exp(pred) pred",No,4,27.0 "read_test = { ""Id"":ID, ""Prediction"":pred } read_ = pd.DataFrame(read_test) read_.to_csv(""sample_submission.csv"",index=False) ",No,4,25.0 "import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) import random as rnd # visualization import seaborn as sns import matplotlib.pyplot as plt %matplotlib inline from nltk.classify import SklearnClassifier from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier from sklearn.svm import SVR,NuSVR,LinearSVR,SVC #support vector regression from sklearn.linear_model import LinearRegression,Ridge,Lasso#Ridge() and Lasso() from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv) from sklearn.model_selection import StratifiedKFold test = pd.read_csv(""../input/restaurant-revenue-prediction/test.csv"") train = pd.read_csv(""../input/restaurant-revenue-prediction/train.csv"") train.head()",No,4,45.0 print(train.info()),No,5,40.0 "train['P29'] = train['P29'].astype(int) test['P29'] = test['P29'].astype(int) test[""P29""].fillna(test[""P29""].median(), inplace=True) train['P26'] = train['P26'].astype(int) test['P26'] = test['P26'].astype(int) train['P27'] = train['P27'].astype(int) test['P27'] = test['P27'].astype(int) train['P28'] = train['P28'].astype(int) test['P28'] = test['P28'].astype(int) train['P13'] = train['P13'].astype(int) test['P13'] = test['P13'].astype(int) train['P2'] = train['P2'].astype(int) test['P2'] = test['P2'].astype(int) train['P3'] = train['P3'].astype(int) test['P3'] = test['P3'].astype(int) train['P4'] = train['P4'].astype(int) test['P4'] = test['P4'].astype(int)'",No,4,16.0 train.describe(include=['object']),No,5,40.0 train['City'].value_counts(),No,5,72.0 train['City Group'].value_counts(),No,5,72.0 train['Type'].value_counts(),No,5,72.0 "corrmat = train.corr() f, ax = plt.subplots(figsize=(12, 9)) sns.heatmap(corrmat, vmax=.8, square=True);",No,5,80.0 "train[""City Group""] = train[""City Group""].map({""Big Cities"": 0, ""Other"":1}) test[""City Group""] = test[""City Group""].map({""Big Cities"": 0, ""Other"":1}) train[""Type""] = train[""Type""].map({""FC"": 0, ""IL"":1,""DT"":2}) test[""Type""] = test[""Type""].map({""FC"": 0, ""IL"":1,""DT"":2}) # Is city important or not #How can we get groups of revenue and plot it against city groups and types to compare",No,5,20.0 "test[""Type""].fillna(test[""Type""].median(), inplace=True) train[""revenue""].fillna(train[""revenue""].median(), inplace=True) train['revenue'] = train['revenue'].astype(int) import numpy Y_train=train[""revenue""].apply(numpy.log) '",No,4,17.0 "X_train = train.drop(['City','Open Date','revenue','Id','City Group'], axis=1) #X_test = test.drop(""Id"",axis=1).copy() X_test = test.drop(['City','Open Date','Id','City Group'], axis=1) X_train.head() #X_test.head()'",No,4,10.0 "#from sklearn.impute import SimpleImputer #my_imputer = SimpleImputer() #imputed_X_train = my_imputer.fit_transform(X_train) #imputed_X_test = my_imputer.transform(X_test)",No,3,12.0 "test.head() test=test.drop(['City','Open Date','City Group'], axis=1)",No,5,10.0 submission,No,5,41.0 "submission.to_csv('submission.csv', index=False)",No,5,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load in import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) import seaborn as sns import matplotlib.pyplot as plt from scipy.stats import norm, skew # Input data files are available in the ""../input/"" directory. 
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory import os for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename)) # Any results you write to the current directory are saved as output.",No,4,22.0 "print('Start model training') start_time = time.time() history_cc = model_cc.fit(X_train_cc, y_train_cc, epochs = 100,validation_data = (X_val_cc, y_val_cc), verbose = 2, callbacks=[early_stop]) model_cc.save(""model_cc.h5"") print('Time spent for model training is {} minutes'.format(round((time.time()-start_time)/60,1)))'",No,4,7.0 "# Plot training & validation loss values plt.figure(figsize=(8,5)) plt.plot(history_cc.history['loss']) plt.plot(history_cc.history['val_loss']) plt.title('CC Model loss') plt.ylabel('Loss') plt.xlabel('Epoch') plt.legend(['Train', 'Test'], loc='upper left') plt.show()",No,5,35.0 "print('Start model training') start_time = time.time() history_ft = model_ft.fit(X_train_ft, y_train_ft, epochs = 100,validation_data = (X_val_ft, y_val_ft), verbose = 2, callbacks=[early_stop]) model_ft.save(""model_ft.h5"") print('Time spent for model training is {} minutes'.format(round((time.time()-start_time)/60,1)))'",No,4,7.0 "# Plot training & validation loss values plt.figure(figsize=(8,5)) plt.plot(history_ft.history['loss']) plt.plot(history_ft.history['val_loss']) plt.title('FT Model loss') plt.ylabel('Loss') plt.xlabel('Epoch') plt.legend(['Train', 'Test'], loc='upper left') plt.show()",No,5,35.0 "# Validate if output makes sense yhat_val_cc = model_cc.predict(X_val_cc) print(yhat_val_cc)",No,5,48.0 print(val_y_cc),No,5,53.0 "# Validate if output makes sense yhat_val_ft = model_cc.predict(X_val_ft) print(yhat_val_ft)",No,5,48.0 print(val_y_ft),No,5,53.0 "#TODO: takes 14m ish, consider multi-processing, multi-cores, run in GPU #TODO: create data_generate func start_time = time.time() test['Country_Region'] = test['Country_Region'].astype(str) test['Province_State'] = test['Province_State'].astype(str) country = test['Country_Region'].drop_duplicates() adj_input_cols = [e for e in input_cols if e not in ('province_encoder', 'country_encoder', 'date_int')] # fill data for overlapped days test_df = test.copy().join(pd.DataFrame(columns = adj_input_cols + output_cols)) test_df.rename(columns={'Date': 'date', 'Country_Region': 'country', 'Province_State': 'province'}, inplace=True) lags = np.arange(1,8,1) # lag of 1 to 7 test_overlap_mask = (test_df['date'] <= train_df['date'].max()) train_overlap_mask = (train_df['date'] >= test_df['date'].min()) test_df.loc[test_overlap_mask, input_cols + output_cols] = train_df.loc[train_overlap_mask, input_cols + output_cols].values # predict data for forward days pred_dt_range = pd.date_range(start = train_df['date'].max() + pd.Timedelta(days=1), end = test_df['date'].max(), freq = '1D') # test_df['date'].max() with tqdm(total = len(pred_dt_range)) as pbar: for d in pred_dt_range: for i in country: province = test_df[test_df['country'] == i]['province'].drop_duplicates() for j in province: mask = (test_df['date'] == d) & (test_df['country'] == i) & (test_df['province'] == j) # update input features for the predicted day for lag in lags: mask_org = (test_df['date'] == (d - pd.Timedelta(days=lag))) & (test_df['country'] == i) & (test_df['province'] == j) try: test_df.loc[mask, 'cc_cases_' + str(lag)] = test_df.loc[mask_org, 'cc_cases'].values except: test_df.loc[mask, 'cc_cases_' + str(lag)] = 0 try: test_df.loc[mask, 
'ft_cases_' + str(lag)] = test_df.loc[mask_org, 'ft_cases'].values except: test_df.loc[mask, 'ft_cases_' + str(lag)] = 0 test_X = test_df.loc[mask, input_cols] # predict for confirmed cases test_X_cc = test_X[cc_input] X_test_cc= test_X_cc # X_test_cc = X_scaler_cc.transform(test_X_cc) # input/output 2D array-like # X_test_cc = X_test_cc.reshape(X_test_cc.shape[0], 1, X_test_cc.shape[1]) X_test_cc = X_test_cc.to_numpy().reshape(X_test_cc.shape[0], 1, X_test_cc.shape[1]) next_cc = model_cc.predict(X_test_cc) # next_cc_scaled = y_scaler_cc.inverse_transform(next_cc) next_cc_scaled = next_cc # predict for fatal cases test_X_ft = test_X[ft_input] X_test_ft = test_X_ft # X_test_ft = X_scaler_ft.transform(test_X_ft) # input/output 2D array-like # X_test_ft = X_test_ft.reshape(X_test_ft.shape[0], 1, X_test_ft.shape[1]) X_test_ft = X_test_ft.to_numpy().reshape(X_test_ft.shape[0], 1, X_test_ft.shape[1]) next_ft = model_ft.predict(X_test_ft) # use the fatalities model here, not model_cc # next_ft_scaled = y_scaler_ft.inverse_transform(next_ft) next_ft_scaled = next_ft # print(d, ' - ', i, ' - ', j, ' - Predicted Confirmed Cases are ', next_cc_scaled, ' - Predicted Fatal Cases are ', next_ft_scaled) # update yhat for next day test_df.loc[mask, 'cc_cases'] = next_cc_scaled test_df.loc[mask, 'ft_cases'] = next_ft_scaled pbar.update(1) print('Time spent for building features is {} minutes'.format(round((time.time()-start_time)/60,1)))",No,2,8.0 "submission = pd.DataFrame() submission['ForecastId'] = test_df['ForecastId'] submission['ConfirmedCases'] = test_df['cc_cases'] submission['Fatalities'] = test_df['ft_cases']",No,5,12.0 "submission.to_csv(""submission.csv"",index=False)",No,5,25.0 submission[:20],No,5,41.0 "import warnings warnings.filterwarnings('ignore')",No,5,23.0 "import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns",No,5,22.0 "df_train = pd.read_csv('/kaggle/input/restaurant-revenue-prediction/train.csv.zip') df_test = pd.read_csv('/kaggle/input/restaurant-revenue-prediction/test.csv.zip') df_train.shape, df_test.shape",No,4,45.0 train_revenue = df_train.pop('revenue'),No,5,10.0 df_train.isnull().sum().sort_values(ascending=False),No,5,39.0 df_test.isnull().sum().sort_values(ascending=False),No,5,39.0 df_train['Open Date'] = df_train['Open Date'].str.split('/').apply(lambda x : x[2]),No,5,8.0 df_test['Open Date'] = df_test['Open Date'].str.split('/').apply(lambda x : x[2]),No,5,8.0 "df_train.shape, df_test.shape",No,5,58.0 "df_train.drop(columns=[""Id""],inplace=True) df_test_index = df_test.pop('Id')",No,5,10.0 "from sklearn.preprocessing import OneHotEncoder ohe = OneHotEncoder(handle_unknown='ignore')",No,3,20.0 "df_train_ohe = ohe.fit_transform(df_train) df_train_ohe = df_train_ohe.todense()",No,5,20.0 "df_test_ohe = ohe.transform(df_test) df_test_ohe = df_test_ohe.toarray()",No,5,20.0 df_train_ohe.shape,No,5,58.0 df_test_ohe.shape,No,5,58.0 "from sklearn.ensemble import GradientBoostingRegressor from sklearn.model_selection import GridSearchCV",No,5,22.0 "param = { ""n_estimators"": range(10,20,2), ""learning_rate"": [0.0001,0.001,0.01,0.1], ""loss"" : ['ls', 'lad', 'huber', 'quantile'], ""min_samples_split"": range(10,15,2), ""min_samples_leaf"": range(10,15,2), ""max_depth"": range(10,20,2), ""alpha"": [0,0.1,0.3,0.5,0.7,0.9] }",No,5,5.0 "gbr = GradientBoostingRegressor(alpha=0.1, ccp_alpha=0.0, criterion='friedman_mse', init=None, learning_rate=0.1, loss='ls', max_depth=10, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=10,
min_samples_split=10, min_weight_fraction_leaf=0.0, n_estimators=10, n_iter_no_change=None, presort='deprecated', random_state=17, subsample=1.0, tol=0.0001, validation_fraction=0.1, verbose=0, warm_start=False)",No,5,4.0 "gbr.fit(df_train_ohe,train_revenue)",No,5,7.0 "train_revenue_predict = gbr.predict(df_train_ohe) test_revenue = gbr.predict(df_test_ohe)",No,4,27.0 "from sklearn.metrics import mean_squared_error mse = mean_squared_error(train_revenue_predict,train_revenue) rmse = np.sqrt(mse) print(rmse)",No,5,28.0 "df_submit = pd.DataFrame({'Id': df_test_index, 'Prediction': test_revenue})",No,5,12.0 "df_submit.to_csv('submit.csv',index=False) df_submit.head()",No,5,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) %matplotlib inline import matplotlib.pyplot as plt from sklearn.model_selection import train_test_split from sklearn import datasets from sklearn.metrics import mean_squared_error from sklearn import ensemble # Input data files are available in the read-only ""../input/"" directory # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory import os for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename)) # You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using ""Save & Run All"" # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session'",No,5,88.0 "data = pd.read_csv('/kaggle/input/restaurant-revenue-prediction/train.csv.zip') data.head()",No,4,45.0 data.isnull().sum(),No,5,39.0 y_train = data.pop('revenue'),No,5,10.0 "data = data.drop(data.columns[[0, 1,2,3,4]], axis=1) x_train=data[:] data.head()",No,4,10.0 "from sklearn.ensemble import GradientBoostingRegressor model = GradientBoostingRegressor() model.fit(x_train, y_train)",No,5,7.0 "from sklearn.ensemble import GradientBoostingRegressor learning_rates = [1, 0.5, 0.25, 0.1, 0.05, 0.01] #learing_rates=[] train_results = [] test_results = [] #beta=range(.05,1.05,.05) for eta in learning_rates: #learning_rates.append(eta) model = GradientBoostingRegressor(learning_rate=eta) model.fit(x_train, y_train) from sklearn.metrics import mean_squared_error, r2_score model_score = model.score(x_train,y_train) # Have a look at R sq to give an idea of the fit , # Explained variance score: 1 is perfect prediction y_predicted_train=model.predict(x_train) train_results.append(mean_squared_error(y_train, y_predicted_train)) print('R2 sq: ',model_score)",No,5,2.0 "from matplotlib.legend_handler import HandlerLine2D line1, = plt.plot(learning_rates, train_results, 'b', label=""Training MSE"") plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)}) plt.ylabel('Mean Squared Error') plt.xlabel('Learning Rate') plt.show()'",No,5,35.0 "from sklearn.ensemble import GradientBoostingRegressor n_estimators = [ 1, 2, 4, 8, 16, 32, 64, 100, 200, 500, 1000, 2000] train_results = [] test_results = [] for estimator in n_estimators: model = GradientBoostingRegressor(n_estimators=estimator) model.fit(x_train, y_train) from sklearn.metrics import mean_squared_error, r2_score model_score = model.score(x_train,y_train) # Have 
a look at R sq to give an idea of the fit , # Explained variance score: 1 is perfect prediction y_predicted_train=model.predict(x_train) train_results.append(mean_squared_error(y_train, y_predicted_train)) print('R2 sq: ',model_score)",No,5,2.0 "from matplotlib.legend_handler import HandlerLine2D line1, = plt.plot(n_estimators, train_results, 'b', label=""Training MSE"") plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)}) plt.ylabel('Mean Squared Error') plt.xlabel('Number of Estimators') plt.show()'",No,5,35.0 "from sklearn.ensemble import GradientBoostingRegressor #min_samples_splits = np.linspace(0.1, 1.0, 10, endpoint=True) min_samples_splits=[2,3,4,5,6,7,8,9,10] train_results = [] test_results = [] for min_samples_split in min_samples_splits: model = GradientBoostingRegressor(min_samples_split=min_samples_split) model.fit(x_train, y_train) from sklearn.metrics import mean_squared_error, r2_score model_score = model.score(x_train,y_train) # Have a look at R sq to give an idea of the fit , # Explained variance score: 1 is perfect prediction y_predicted_train=model.predict(x_train) train_results.append(mean_squared_error(y_train, y_predicted_train)) print('R2 sq: ',model_score)",No,5,2.0 "from matplotlib.legend_handler import HandlerLine2D line1, = plt.plot(min_samples_splits, train_results, 'b', label=""Training MSE"") plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)}) plt.ylabel('Mean Squared Error') plt.xlabel('Min Samples Split') plt.show()'",No,5,84.0 "from sklearn.ensemble import GradientBoostingRegressor max_depths = [1,2,3,4,5,6,7,8,9,10] train_results = [] test_results = [] for max_depth in max_depths: model = GradientBoostingRegressor(max_depth=max_depth) model.fit(x_train, y_train) from sklearn.metrics import mean_squared_error, r2_score model_score = model.score(x_train,y_train) # Have a look at R sq to give an idea of the fit , # Explained variance score: 1 is perfect prediction y_predicted_train=model.predict(x_train) train_results.append(mean_squared_error(y_train, y_predicted_train)) print('R2 sq: ',model_score)",No,5,2.0 "from matplotlib.legend_handler import HandlerLine2D line1, = plt.plot(max_depths, train_results, 'b', label=""Training MSE"") plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)}) plt.ylabel('Mean Squared Error') plt.xlabel('No. of Depth') plt.show()'",No,5,35.0 "from sklearn.ensemble import GradientBoostingRegressor #min_samples_leafs = np.linspace(0.1, 0.5, 5, endpoint=True) min_samples_leafs = [1,2,3,4,5,6,7,8,9,10] train_results = [] test_results = [] for min_samples_leaf in min_samples_leafs: model = GradientBoostingRegressor(min_samples_leaf=min_samples_leaf) model.fit(x_train, y_train) from sklearn.metrics import mean_squared_error, r2_score model_score = model.score(x_train,y_train) # Have a look at R sq to give an idea of the fit , # Explained variance score: 1 is perfect prediction y_predicted_train=model.predict(x_train) train_results.append(mean_squared_error(y_train, y_predicted_train)) print('R2 sq: ',model_score)",No,5,2.0 "from matplotlib.legend_handler import HandlerLine2D line1, = plt.plot(min_samples_leafs, train_results, 'b', label=""Training MSE"") plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)}) plt.ylabel('Mean Squared Error') plt.xlabel('No. 
of Samples Leaf') plt.show()'",No,5,35.0 "from sklearn.ensemble import GradientBoostingRegressor max_features = list(range(1,data.shape[1])) train_results = [] test_results = [] for max_feature in max_features: model = GradientBoostingRegressor(max_features=max_feature) model.fit(x_train, y_train) from sklearn.metrics import mean_squared_error, r2_score model_score = model.score(x_train,y_train) # Have a look at R sq to give an idea of the fit , # Explained variance score: 1 is perfect prediction y_predicted_train=model.predict(x_train) train_results.append(mean_squared_error(y_train, y_predicted_train)) print('R2 sq: ',model_score)",No,5,2.0 "from matplotlib.legend_handler import HandlerLine2D line1, = plt.plot(max_features, train_results, 'b', label=""Training MSE"") plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)}) plt.ylabel('Mean Squared Error') plt.xlabel('No. of Feature') plt.show()'",No,5,35.0 "data1 = pd.read_csv('/kaggle/input/restaurant-revenue-prediction/test.csv.zip') data1.head()",No,4,45.0 "data1=data1.drop(data1.columns[[0, 1,2,3,4]], axis=1)",No,5,10.0 data1.head(),No,5,41.0 sample =pd.read_csv('/kaggle/input/restaurant-revenue-prediction/sampleSubmission.csv'),No,5,45.0 "params = {'n_estimators': 225, 'max_depth': 4, 'min_samples_split': 2, 'learning_rate': 0.25, 'loss': 'ls','max_features':10, 'min_samples_leaf':2} model = ensemble.GradientBoostingRegressor(**params) model.fit(x_train, y_train) sample[""Prediction""] = model.predict(data1)'",No,3,4.0 "sample.to_csv('submission.csv', index = False)",No,5,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv) # Input data files are available in the read-only ""../input/"" directory # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory import os for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename)) # You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using ""Save & Run All"" # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session'",No,5,88.0 " import pandas as pd import numpy as np ",No,5,22.0 "import pandas as pd df_train = pd.read_csv('../input/restaurant-revenue-prediction/train.csv.zip') df_test = pd.read_csv('../input/restaurant-revenue-prediction/test.csv.zip')",No,4,45.0 "train_corr = df_train.corr() print(train_corr) train_corr.to_csv(""corr.csv"", index = False)",No,4,25.0 "import datetime from sklearn.preprocessing import LabelEncoder revenue = df_train[""revenue""]",No,4,22.0 "del df_train[""revenue""]",No,5,10.0 "df_whole = pd.concat([df_train, df_test], axis=0)",No,5,11.0 "df_whole[""Open Date""] = pd.to_datetime(df_whole[""Open Date""]) df_whole[""Year""] = df_whole[""Open Date""].apply(lambda x:x.year) df_whole[""Month""] = df_whole[""Open Date""].apply(lambda x:x.month) df_whole[""Day""] = df_whole[""Open Date""].apply(lambda x:x.day)",No,4,8.0 "le = LabelEncoder() df_whole[""City""] = le.fit_transform(df_whole[""City""])",No,5,20.0 "df_whole[""City Group""] = df_whole[""City Group""].map({""Other"":0, ""Big Cities"":1})",No,5,20.0 "df_whole[""Type""] = df_whole[""Type""].map({""FC"":0, ""IL"":1, ""DT"":2, ""MB"":3})",No,5,20.0 "df_train = df_whole.iloc[:df_train.shape[0]] df_test = df_whole.iloc[df_train.shape[0]:]",No,5,13.0 "from sklearn.ensemble import RandomForestRegressor # df_train_columns = [col for col in df_train.columns if col not in [""Id"", ""Open Date""]] #RandomForest rf = RandomForestRegressor( n_estimators=200, max_depth=5, max_features=0.5, random_state=449, n_jobs=-1 ) rf.fit(df_train[df_train_columns], revenue)'",No,2,7.0 prediction = rf.predict(df_test[df_train_columns]),No,5,48.0 "submission = pd.DataFrame({""Id"":df_test.Id, ""Prediction"":prediction}) submission.to_csv(""TFI_submission.csv"", index=False)",No,4,25.0 "df = pd.read_csv('../input/restaurant-revenue-prediction/train.csv.zip', index_col='Id') df",No,5,45.0 "y = df.revenue X = df.drop(columns=['revenue'], axis=1)",No,5,21.0 "y ",No,5,41.0 X,No,5,41.0 "for col in X.columns: print(col, df[col].dtype)",No,5,70.0 "y.isna().sum() ",No,5,39.0 "X_num = X.select_dtypes(exclude=['object']) X_num",No,5,84.0 "df.shape ",No,5,58.0 "X_num.shape ",No,5,58.0 "for col in X_num.columns: if X_num[col].isna().sum() > 0: print(col, X_num[col].isna().sum() / len(X_num) )",No,5,39.0 "from sklearn.impute import SimpleImputer imputer = SimpleImputer() X_num_imputed = pd.DataFrame(imputer.fit_transform(X_num))",No,4,17.0 "parameters = { 'n_estimators': list(range(100, 1001, 100)), 'max_leaf_nodes': list(range(2, 70, 5)), 'max_depth': list(range(6, 70, 5)) } parameters",No,5,5.0 "my_randome_state=1486 ",No,5,77.0 "from sklearn.model_selection import GridSearchCV from sklearn.ensemble import RandomForestRegressor gsearch = GridSearchCV(estimator=RandomForestRegressor(random_state=my_randome_state), param_grid = parameters, scoring='neg_mean_absolute_error', n_jobs=4,cv=5, verbose=7)",No,4,5.0 "gsearch.fit(X_num_imputed, y) ",No,5,6.0 "best_n_estimators = 
gsearch.best_params_.get('n_estimators') best_n_estimators",No,5,2.0 "best_max_leaf_nodes = gsearch.best_params_.get('max_leaf_nodes') best_max_leaf_nodes",No,5,2.0 "best_max_depth = gsearch.best_params_.get('max_depth') best_max_depth",No,5,2.0 "final_model = RandomForestRegressor(n_estimators=best_n_estimators, random_state=my_randome_state, max_leaf_nodes=best_max_leaf_nodes, max_depth=best_max_depth)",No,5,4.0 "final_model.fit(X_num_imputed, y) ",No,5,7.0 "X_test = pd.read_csv('../input/restaurant-revenue-prediction/test.csv.zip', index_col='Id') X_test ",No,5,45.0 "X_test_num = X_test.select_dtypes(exclude=['object']) ",No,5,14.0 "for col in X_test_num.columns: if X_test_num[col].isna().sum() > 0: print(col, X_test_num[col].isna().sum() / len(X_test_num) )",No,5,39.0 "X_test_num_imputed = pd.DataFrame(imputer.transform(X_test_num)) X_test_num_imputed.columns = X_test_num.columns X_test_num_imputed",No,5,61.0 "preds_test = final_model.predict(X_test_num_imputed) ",No,5,48.0 "len(preds_test) ",No,5,40.0 "# Save test predictions to file output = pd.DataFrame({'Id': X_test.index, 'Prediction': preds_test}) output",No,5,55.0 "output.to_csv('submission.csv', index=False) print('done!')",No,5,25.0 "data_train = pd.read_csv('../input/restaurant-revenue-prediction/train.csv.zip',index_col='Id', parse_dates=[""Open Date""]) data_test = pd.read_csv('../input/restaurant-revenue-prediction/test.csv.zip',index_col='Id', parse_dates=[""Open Date""]) data_train'",No,4,45.0 data_train.describe(),No,5,40.0 data_train.isnull().sum(),No,5,39.0 "for i in data_train.columns: print(i ,': ',len(data_train[i].unique()))",No,5,54.0 "columnsForDrop = ['Open Date'] data_train.drop(columns=columnsForDrop, inplace=True) ################################ data_test.drop(columns=columnsForDrop, inplace=True) data_train",No,4,10.0 "s = (data_train.dtypes == 'object') object_cols = list(s[s].index) print(""Categorical variables:"") print(object_cols)'",No,4,37.0 "from sklearn.preprocessing import LabelEncoder # Make copy to avoid changing original data label_X_train = data_train.copy() label_X_test = data_test.copy() # Apply label encoder to each column with categorical data label_encoder = LabelEncoder() for col in object_cols: label_encoder.fit(pd.concat([data_train[col], data_test[col]], axis=0, sort=False)) label_X_train[col] = label_encoder.transform(data_train[col]) label_X_test[col] = label_encoder.transform(data_test[col])",No,5,20.0 "data_train = label_X_train data_test = label_X_test data_train",No,4,41.0 "y = data_train.revenue ############################################ X = data_train.drop(columns=['revenue'])",No,5,21.0 "from sklearn.tree import DecisionTreeClassifier from sklearn.metrics import accuracy_score, classification_report, f1_score from sklearn.neighbors import KNeighborsClassifier",No,5,22.0 "from xgboost import XGBRegressor from sklearn.metrics import mean_absolute_error ,explained_variance_score, mean_squared_error #########################################################################3 from sklearn.ensemble import RandomForestRegressor parameters = {'max_depth': list(range(6, 30, 10)), 'max_leaf_nodes': list(range(50, 500, 100)), 'n_estimators': list(range(50, 1001, 150))} parameters1 = {'max_depth': [6], 'max_leaf_nodes': [250], 'n_estimators': [100]} from sklearn.model_selection import GridSearchCV gsearch = GridSearchCV(estimator=RandomForestRegressor(), param_grid = parameters, scoring='neg_mean_squared_error', n_jobs=4,cv=5,verbose=7) gsearch.fit(X, y)",No,4,6.0 
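A minimal sketch (not part of the original notebook) of how the fitted gsearch object above can be inspected before reading off the individual best parameters in the next cells. It assumes gsearch is the GridSearchCV instance just fitted on X and y with scoring='neg_mean_squared_error', and it converts the best score back into an RMSE, which is easier to compare against the competition metric.

import numpy as np
import pandas as pd

# Assumes `gsearch` is the fitted GridSearchCV object from the cell above.
cv_table = pd.DataFrame(gsearch.cv_results_)
cv_table = cv_table.sort_values('rank_test_score')[['params', 'mean_test_score', 'std_test_score']]
print(cv_table.head())
# scoring was 'neg_mean_squared_error', so negate before taking the square root
print('Best CV RMSE:', np.sqrt(-gsearch.best_score_))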
"print(gsearch.best_params_.get('n_estimators')) print(gsearch.best_params_.get('max_leaf_nodes')) print(gsearch.best_params_.get('max_depth'))",No,5,2.0 "print(data_train.shape) print(data_test.shape) print(X.shape)",No,5,58.0 "final_model = RandomForestRegressor( max_depth = gsearch.best_params_.get('max_depth'), max_leaf_nodes = gsearch.best_params_.get('max_leaf_nodes'), n_estimators = gsearch.best_params_.get('n_estimators'),random_state=1, n_jobs=4) final_model.fit(X, y)",No,4,6.0 "preds = final_model.predict(data_test) print(preds.shape) print(data_test.shape)",No,4,48.0 "testData = pd.read_csv(""../input/restaurant-revenue-prediction/test.csv.zip"") submission = pd.DataFrame({ ""Id"": testData[""Id""], ""Prediction"": preds }) submission.to_csv('RandomForestSimple.csv',header=True, index=False) print('Done')'",No,4,25.0 "df_train=pd.read_csv(""/kaggle/input/restaurant-revenue-prediction/train.csv.zip"",index_col='Id') X_test = pd.read_csv('../input/restaurant-revenue-prediction/test.csv.zip',index_col='Id') df_train.shape,X_test.shape '",No,4,45.0 "train_with_missing = [col for col in df_train.columns if df_train[col].isnull().any()] test_with_missing = [col for col in X_test.columns if X_test[col].isnull().any()] train_with_missing,test_with_missing",No,3,17.0 "y_train = df_train.revenue X_train = df_train.drop(columns=['revenue'], axis=1)",No,5,21.0 "X_train.shape,X_test.shape",No,5,58.0 "b""import matplotlib.pyplot as plt\n\nd_names = ('train.csv.zip', 'test.csv.zip')\ny_pos = range(len(d_names))\n \nplt.bar(\n y_pos, \n (X_train.shape[0], X_test.shape[0]), \n align='center', \n alpha=0.8\n)\nplt.xticks(y_pos, d_names)\nplt.ylabel('Number of rows') \nplt.title(' Wow!')\nplt.show()""",No,4,33.0 "bad_label_cols = list(set(X_train.columns)-set(X_test.columns)) bad_label_cols",No,4,37.0 X_train['City'].value_counts(),No,5,72.0 X_train.Type.value_counts(),No,5,72.0 "X_train['year'] = pd.DatetimeIndex(X_train['Open Date']).year X_train.drop(columns=['Open Date','City'],inplace=True)",No,4,10.0 "X_test['year'] = pd.DatetimeIndex(X_test['Open Date']).year X_test.drop(columns=['Open Date','City'],inplace=True)",No,4,10.0 "from sklearn.preprocessing import OneHotEncoder # Apply one-hot encoder to each column with categorical data OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False) OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[object_cols])) OH_cols_test = pd.DataFrame(OH_encoder.transform(X_test[object_cols])) # One-hot encoding removed index; put it back OH_cols_train.index = X_train.index OH_cols_test.index = X_test.index # Remove categorical columns (will replace with one-hot encoding) num_X_train = X_train.drop(object_cols, axis=1) num_X_test = X_test.drop(object_cols, axis=1) # Add one-hot encoded columns to numerical features OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1) OH_X_test = pd.concat([num_X_test, OH_cols_test], axis=1)",Yes,3,20.0 OH_X_train.head(),No,5,41.0 OH_X_test.head(),No,5,41.0 "train_stats=OH_X_train.describe().transpose() train_stats ",No,5,40.0 "def norm(x): return (x - train_stats['mean']) / train_stats['std'] normed_train_data = norm(OH_X_train) normed_test_data = norm(OH_X_test)",No,5,18.0 normed_test_data.head(),No,5,41.0 "parameters = { 'n_estimators': list(range(10, 300, 20)), 'learning_rate': [l / 100 for l in range(5, 100, 20)], 'max_depth': list(range(1, 20,3)), 'gamma':[2,3], 'eta':[0.8,0.9], 'reg_alpha':[0.5,0.6,0.7,0.8], 'reg_lambda':[0.5,0.6,0.7,0.8] } parameters",No,5,5.0 my_randome_state=70,No,5,77.0 
"from sklearn.model_selection import GridSearchCV from xgboost import XGBRegressor gsearch = GridSearchCV(estimator=XGBRegressor(random_state=my_randome_state), param_grid = parameters, scoring='neg_root_mean_squared_error', n_jobs=4,cv=5, verbose=7) gsearch.fit(normed_train_data, y_train)",No,4,6.0 "best_n_estimators = gsearch.best_params_.get('n_estimators') best_learning_rate = gsearch.best_params_.get('learning_rate') best_max_depth = gsearch.best_params_.get('max_depth') best_max_gamma = gsearch.best_params_.get('gamma') best_max_eta = gsearch.best_params_.get('eta') best_max_reg_alpha = gsearch.best_params_.get('reg_alpha') best_max_reg_lambda = gsearch.best_params_.get('reg_lambda') best_max_depth,best_n_estimators,best_learning_rate,best_max_gamma,best_max_eta,best_max_reg_alpha,best_max_reg_lambda",No,5,2.0 "final_model = XGBRegressor(n_estimators=best_n_estimators, random_state=my_randome_state, learning_rate=best_learning_rate, max_depth=best_max_depth, gamma=best_max_gamma, eta=best_max_eta, reg_alpha=best_max_reg_alpha, reg_lambda=best_max_reg_lambda) final_model.fit(normed_train_data, y_train)",No,4,7.0 preds_test = final_model.predict(normed_test_data),No,5,48.0 "# zip import zipfile with zipfile.ZipFile(""/kaggle/input/restaurant-revenue-prediction/test.csv.zip"") as zf: zf.extractall() with zipfile.ZipFile(""/kaggle/input/restaurant-revenue-prediction/train.csv.zip"") as zf: zf.extractall() '",No,3,44.0 "# df_train = pd.read_csv(""train.csv"") df_test = pd.read_csv(""test.csv"") df_submission = pd.read_csv(""/kaggle/input/restaurant-revenue-prediction/sampleSubmission.csv"")'",No,5,45.0 df_test,No,5,41.0 "# corrmat = df_train.corr() # corrmat'",No,5,40.0 "# y_train = df_train[""revenue""] del df_train[""revenue""]'",No,4,21.0 "# df_all = pd.concat([df_train, df_test], axis=0) # axis=0 : '",No,5,11.0 "# OpenDate # timestamp df_all[""Open Date""] = pd.to_datetime(df_all[""Open Date""]) df_all[""Year""] = df_all[""Open Date""].dt.year df_all[""Month""] = df_all[""Open Date""].dt.month df_all[""Day""] = df_all[""Open Date""].dt.day'",No,4,8.0 "dftrain['Open Date'] = dftrain['Open Date'].apply(lambda x: pd.to_datetime(x)) dftest['Open Date'] = dftest['Open Date'].apply(lambda x: pd.to_datetime(x)) def calc_days(dtime): now_time = pd.to_datetime('2015-01-01') result = now_time - dtime return int(result.total_seconds()/3600//24) dftrain['days_since_open'] = dftrain['Open Date'].apply(lambda x: calc_days(x)) dftest['days_since_open'] = dftest['Open Date'].apply(lambda x: calc_days(x))",No,3,8.0 dftrain['revenue'] = dftrain['revenue'].astype(int),No,5,16.0 " for col in dftrain.columns: if (dftrain[col].dtype == int) | (dftrain[col].dtype == float): if col not in ['P2','P3', 'P7','revenue']: dftrain[col] = dftrain[col].map(lambda x:np.log1p(x)) print (col) dftest[col] = dftest[col].map(lambda x: np.log1p(x)) ",No,5,20.0 pd.get_dummies(dftrain).columns,No,5,71.0 "#TFI (tab food investments) has provided a dataset with 137 restaurants in the training set, and a test set of 100000 restaurants.. 
data = pd.read_csv('../input/restaurant-revenue-prediction/train.csv') test_data = pd.read_csv('../input/restaurant-revenue-prediction/test.csv')",No,5,45.0 data.describe(),No,5,40.0 data.head(),No,5,41.0 pd.get_dummies(dftest).columns[50:],No,5,53.0 "#log transforming dftrain revenue dftrain['revenue']= dftrain.revenue.apply(lambda x: np.log1p(x))",No,5,8.0 "dftrain1 = pd.get_dummies(dftrain, drop_first = True) dftest1 = pd.get_dummies(dftest,drop_first = True)",No,5,20.0 "cat_cols = ['City', 'Open Date', 'City']",No,3,10.0 final_pred = model.predict(dftest1),No,2,10.0 "sampledf = pd.read_csv('/kaggle/input/restaurant-revenue-prediction/sampleSubmission.csv') ",No,2,7.0 sampledf.head(),No,5,41.0 sub = pd.DataFrame(),No,5,12.0 "sub['Id'] = sampledf.Id sub['Prediction'] = np.expm1(final_pred)",No,5,55.0 "sub.to_csv('submission.csv', index = False)",No,5,25.0 "train = pd.read_csv(""../input/restaurant-revenue-prediction/train.csv.zip"")",No,5,45.0 train.shape,No,5,58.0 "train[""City""].value_counts()",No,5,72.0 "train[""City""] = (train[""City""]==""stanbul"").astype(np.int)'",No,5,16.0 "train[""City Group""].value_counts()",No,5,72.0 "train[""City Group""] = l.fit_transform(train[""City Group""])",No,3,16.0 "train[""Type""].value_counts()",No,5,72.0 "train[""Type""] = l.fit_transform(train[""Type""])",No,5,20.0 "train[""year""]=0 for i in range(len(train[""Open Date""])): a=train[""Open Date""][i].split(""/"") train[""year""][i]=a[2] train[""month""]=0 for i in range(len(train[""Open Date""])): a=train[""Open Date""][i].split(""/"") train[""month""][i]=a[0] train[""day_No""]=0 for i in range(len(train[""Open Date""])): a=train[""Open Date""][i].split(""/"") train[""day_No""][i]=a[1] ",No,5,8.0 "test = pd.read_csv(""../input/restaurant-revenue-prediction/test.csv.zip"")",No,5,45.0 test.shape,No,5,58.0 "test[""City""] = (test[""City""]==""stanbul"").astype(np.int) test[""Type""] = l.fit_transform(test[""Type""]) test[""City Group""] = l.fit_transform(test[""City Group""])'",No,4,16.0 "test[""year""]=0 for i in range(len(test[""Open Date""])): a=test[""Open Date""][i].split(""/"") test[""year""][i]=a[2] test[""month""]=0 for i in range(len(test[""Open Date""])): a=test[""Open Date""][i].split(""/"") test[""month""][i]=a[0] test[""day_No""]=0 for i in range(len(test[""Open Date""])): a=test[""Open Date""][i].split(""/"") test[""day_No""][i]=a[0] ",No,5,8.0 "x_train = train.drop(columns=[""Id"",""revenue"",""Open Date""],axis=1) y_train = train[""revenue""] x_test = test.drop(columns=[""Id"",""Open Date""],axis=1) x_train.shape,x_test.shape,y_train.shape",No,5,21.0 "xtrain = s.fit_transform(x_train) x_train = pd.DataFrame(x_train,columns=x_train.columns) xtest = s.fit_transform(x_test) x_test = pd.DataFrame(x_test,columns=x_test.columns)",No,4,18.0 "x_train.shape,x_test.shape,y_train.shape",No,5,58.0 "from xgboost import XGBRegressor model = XGBRegressor() model.fit(x_train,y_train) y_pred = model.predict(x_test)",No,1,48.0 "s = pd.read_csv(""../input/restaurant-revenue-prediction/sampleSubmission.csv"")",No,5,45.0 s.head(),No,5,41.0 f = pd.DataFrame(f),No,5,12.0 "f.to_csv(""submission.csv"",index=False)",No,5,25.0 f.head(),No,5,41.0 "df=pd.read_csv(""/kaggle/input/restaurant-revenue-prediction/train.csv.zip"") df.head()",No,3,45.0 df.describe(),No,5,40.0 "import matplotlib.pyplot as plt import seaborn as sns",No,5,22.0 df.columns,No,5,71.0 "df=df.drop('Id',axis=1)",No,5,10.0 "df.columns ",No,5,71.0 "df['Open Date']=pd.to_datetime(df['Open Date']) df",No,5,16.0 "df['month']=[x.month for x in 
df['Open Date']] df['year']=[x.year for x in df['Open Date']]",No,5,8.0 "df ",No,5,41.0 df.groupby('month')['revenue'].mean(),No,5,60.0 "sns.barplot('month','revenue',data=df)",No,5,33.0 "df=df.drop('Open Date',axis=1) df['Type'].value_counts() ty={'FC':0,'IL':1,'DT':2} df['Type']=df['Type'].map(ty)",No,3,8.0 df['City Group'].value_counts(),No,3,16.0 "cg={'Big Cities':0,'Other':1} df['City Group']=df['City Group'].map(cg)",No,5,20.0 df['City'].value_counts(),No,5,72.0 "x=0 c={} for i in df['City'].unique(): c.update({i:x}) x=x+1 ",No,5,53.0 c,No,5,53.0 "from sklearn.linear_model import LinearRegression from sklearn.tree import DecisionTreeRegressor from sklearn.ensemble import RandomForestRegressor from sklearn.neighbors import KNeighborsRegressor from xgboost import XGBRegressor",No,5,22.0 "from sklearn.metrics import mean_absolute_error ,mean_squared_error,r2_score",No,5,22.0 "dr=DecisionTreeRegressor() dr=dr.fit(X_train,Y_train) pred=dr.predict(X_test) s=mean_absolute_error(Y_test,pred) s1=mean_squared_error(Y_test,pred) s2=r2_score(Y_test,pred) print(""The MAE with the DecisionTreeRegressor is: ""+str(s)) print(""The MsE with the DecisionTreeRegressor is: ""+str(s1)) print(""The R2_Score with the DecisionTreeRegressor is: ""+str(s2)) ",No,4,49.0 "r=RandomForestRegressor() r=r.fit(X_train,Y_train) pred=r.predict(X_test) s=mean_absolute_error(Y_test,pred) s1=mean_squared_error(Y_test,pred) s2=r2_score(Y_test,pred) print(""The MAE with the RandomForestRegressor is: ""+str(s)) print(""The MsE with the RandomForestRegressor is: ""+str(s1)) print(""The R2_Score with the RandomForestRegressor is: ""+str(s2))",No,4,49.0 "x=XGBRegressor() x=dr.fit(X_train,Y_train) pred=x.predict(X_test) s=mean_absolute_error(Y_test,pred) s1=mean_squared_error(Y_test,pred) s2=r2_score(Y_test,pred) print(""The MAE with the XGBRegressor is: ""+str(s)) print(""The MsE with the XGBRegressor is: ""+str(s1)) print(""The R2_Score with the XGBRegressor is: ""+str(s2))",Yes,3,7.0 "df_t=pd.read_csv(""/kaggle/input/restaurant-revenue-prediction/test.csv.zip"") df_t.head() i_d=df_t['Id'] df_t=df_t.drop('Id',axis=1)'",Yes,3,45.0 df_t['Open Date']=pd.to_datetime(df_t['Open Date']),Yes,5,16.0 "df_t['month']=[x.month for x in df_t['Open Date']] df_t['year']=[x.year for x in df_t['Open Date']]",Yes,5,8.0 "df_t=df_t.drop('Open Date',axis=1) df_t['Type'].value_counts() ty={'FC':0,'IL':1,'DT':2} df_t['Type']=df_t['Type'].map(ty) cg={'Big Cities':0,'Other':1} df_t['City Group']=df_t['City Group'].map(cg) x=0 c={} for i in df_t['City'].unique(): c.update({i:x}) x=x+1 df_t['City']=df_t['City'].map(c)",Yes,5,8.0 df_t.head(),No,2,45.0 df_t.dropna,No,5,17.0 df_t['Type']=df_t['Type'].fillna(0),No,5,17.0 df_t.info(),No,5,40.0 " p=k.predict(df_t) ",No,5,48.0 "sub=pd.read_csv(""/kaggle/input/restaurant-revenue-prediction/sampleSubmission.csv"") sub['Id']=i_d'",No,5,45.0 "sub.to_csv(""Submission1.csv"",index=False)",No,5,25.0 "b""import pandas as pd\n\ntrain = pd.read_csv('/kaggle/input/restaurant-revenue-prediction/train.csv.zip')\ntest = pd.read_csv('/kaggle/input/restaurant-revenue-prediction/test.csv.zip')\n\n# Id\ntrain_Id = train.Id\ntest_Id = test.Id\n\n# Id\ntrain.drop('Id', axis=1, inplace=True)\ntest.drop('Id', axis=1, inplace=True)""",No,3,45.0 "#importing the libraries import numpy as np import pandas as pd import matplotlib.pyplot as plt # import seaborn as sns import seaborn as sns; sns.set(style=""ticks"", color_codes=True) from datetime import datetime from scipy import stats from scipy.stats import norm, skew from 
sklearn.preprocessing import StandardScaler from sklearn.preprocessing import MinMaxScaler from sklearn.preprocessing import LabelEncoder from sklearn.model_selection import train_test_split import lightgbm as lgb # 100() # pd.options.display.max_columns = None pd.options.display.max_rows = 80 # 2() pd.options.display.float_format = '{:.2f}'.format %matplotlib inline # import warnings warnings.filterwarnings('ignore') %matplotlib inline'",No,4,23.0 "print('Size of train data', train.shape) print('Size of test data', test.shape)",No,5,58.0 train.describe(include='O'),No,5,40.0 "train[""revenue""].describe()",No,5,40.0 "#revenueQ-Q # fig = plt.figure(figsize=(10, 4)) plt.subplots_adjust(wspace=0.4) # ax = fig.add_subplot(1, 2, 1) sns.distplot(train['revenue'], ax=ax) # QQ ax2 = fig.add_subplot(1, 2, 2) stats.probplot(train['revenue'], plot=ax2) plt.show() # print(train['revenue'].describe()) print(""------------------------------"") print("": %f"" % train['revenue'].skew()) print("": %f"" % train['revenue'].kurt())'",No,3,33.0 "# dataframe df = train.copy() #log(x+1) df['revenue'] = np.log1p(df['revenue']) # (0, 1) scaler=StandardScaler() df['revenue']=scaler.fit_transform(df[['revenue']]) # fig = plt.figure(figsize=(10, 4)) plt.subplots_adjust(wspace=0.4) # ax = fig.add_subplot(1, 2, 1) sns.distplot(df['revenue'], ax=ax) # QQ ax2 = fig.add_subplot(1, 2, 2) stats.probplot(df['revenue'], plot=ax2) plt.show() # print(df['revenue'].describe()) print(""------------------------------"") print("": %f"" % df['revenue'].skew()) print("": %f"" % df['revenue'].kurt())'",Yes,2,33.0 "# dataframe df = train.copy() # (0, 1) scaler=StandardScaler() df['revenue']=scaler.fit_transform(df[['revenue']]) # fig = plt.figure(figsize=(10, 4)) plt.subplots_adjust(wspace=0.4) # ax = fig.add_subplot(1, 2, 1) sns.distplot(df['revenue'], ax=ax) # QQ ax2 = fig.add_subplot(1, 2, 2) stats.probplot(df['revenue'], plot=ax2) plt.show() # print(df['revenue'].describe()) print(""------------------------------"") print("": %f"" % df['revenue'].skew()) print("": %f"" % df['revenue'].kurt())'",Yes,5,33.0 "# dataframe df = train.copy() # Min-Max((1, 0)) scaler=MinMaxScaler() df['revenue']=scaler.fit_transform(df[['revenue']]) # fig = plt.figure(figsize=(10, 4)) plt.subplots_adjust(wspace=0.4) # ax = fig.add_subplot(1, 2, 1) sns.distplot(df['revenue'], ax=ax) # QQ ax2 = fig.add_subplot(1, 2, 2) stats.probplot(df['revenue'], plot=ax2) plt.show() # print(df['revenue'].describe()) print(""------------------------------"") print("": %f"" % df['revenue'].skew()) print("": %f"" % df['revenue'].kurt())'",Yes,3,33.0 "b""# \n# Open Date\ntrain['pd_date'] = pd.to_datetime(train['Open Date'], format='%m/%d/%Y')\n# \ntrain['Open_Year'] = train['pd_date'].dt.strftime('%Y')\n# \ntrain['Open_Month'] = train['pd_date'].dt.strftime('%m')\n\ntrain = train.drop('pd_date',axis=1)\ntrain = train.drop('Open Date',axis=1)""",Yes,3,8.0 "b""# \n# Open Date\ntest['pd_date'] = pd.to_datetime(test['Open Date'], format='%m/%d/%Y')\n# \ntest['Open_Year'] = test['pd_date'].dt.strftime('%Y')\n# \ntest['Open_Month'] = test['pd_date'].dt.strftime('%m')\n\ntest = test.drop('pd_date',axis=1)\ntest = test.drop('Open Date',axis=1)""",Yes,3,8.0 train.dtypes.value_counts(),No,5,72.0 "b""#\ncats = list(train.select_dtypes(include=['object']).columns)\nnums = list(train.select_dtypes(exclude=['object']).columns)\nprint(f'categorical variables: {cats}')\nprint(f'numerical variables: {nums}')""",No,3,37.0 train.nunique(axis=0),No,5,54.0 "columns = len(nominal_list)/2+1 fig = 
plt.figure(figsize=(30, 20)) plt.subplots_adjust(hspace=0.6, wspace=0.4) for i in range(len(nominal_list)): ax = fig.add_subplot(columns, 2, i+1) sns.countplot(x=nominal_list[i], data=train, ax=ax) plt.xticks(rotation=45) plt.show()",No,5,33.0 "columns = len(num_list)/3+1 fig = plt.figure(figsize=(30, 40)) plt.subplots_adjust(hspace=0.6, wspace=0.4) for i in range(len(num_list)): ax = fig.add_subplot(columns, 3, i+1) train[num_list[i]].hist(ax=ax) ax2 = train[num_list[i]].plot.kde(ax=ax, secondary_y=True,title=num_list[i]) ax2.set_ylim(0) plt.show()",No,5,33.0 "columns = len(nominal_list)/2+1 fig = plt.figure(figsize=(20, 10)) plt.subplots_adjust(hspace=0.6, wspace=0.4) for i in range(len(nominal_list)): ax = fig.add_subplot(columns, 2, i+1) # sns.boxplot(x=nominal_list[i], y=train.revenue, data=train, ax=ax) plt.xticks(rotation=45) # # sns.barplot(x = nominal_list[i], y = train.revenue, data=train, ax=ax) plt.show() '",No,5,33.0 "train = train.drop('Open_Month',axis=1) test= test.drop('Open_Month',axis=1) nominal_list.remove('Open_Month')",No,5,10.0 "b""columns = len(num_list)/4+1\n\nfig = plt.figure(figsize=(30, 35))\nplt.subplots_adjust(hspace=0.6, wspace=0.4)\n\nfor i in range(len(num_list)):\n ax = fig.add_subplot(columns, 4, i+1)\n\n # \n sns.regplot(x=num_list[i],y='revenue',data=train, ax=ax)\n plt.xticks(rotation=45)\n # \n# sns.barplot(x = nominal_list[i], y = train.revenue, data=train, ax=ax)\nplt.show()\n""",No,5,33.0 "train[['City','revenue']].groupby('City').mean().plot(kind='bar') plt.title('Mean Revenue Generated vs City') plt.xlabel('City') plt.ylabel('Mean Revenue Generated')",No,5,33.0 "b""# Cityrevenue1000000\nmean_revenue_per_city = train[['City', 'revenue']].groupby('City', as_index=False).mean()\nmean_revenue_per_city.head()\nmean_revenue_per_city['revenue'] = mean_revenue_per_city['revenue'].apply(lambda x: int(x/1e6)) \n\nmean_revenue_per_city\n\nmean_dict = dict(zip(mean_revenue_per_city.City, mean_revenue_per_city.revenue))\nmean_dict""",No,3,60.0 " print(train['City'].sort_values().unique())",No,5,57.0 "test['City'].sort_values().unique() ",No,5,57.0 "b""# City\ncity_train_list = list(train['City'].unique())\ncity_test_list = list(test['City'].unique())""",No,5,57.0 "# P1 # PP distinct_cities = train.loc[:, ""City""].unique() # Pcity means = [] for i in range(len(num_list)): temp = [] for city in distinct_cities: temp.append(train.loc[train.City == city, num_list[i]].mean()) means.append(temp) city_pvars = pd.DataFrame(columns=[""city_var"", ""means""]) for i in range(37): for j in range(len(distinct_cities)): city_pvars.loc[i+37*j] = [""P""+str(i+1), means[i][j]] print(city_pvars) # plt.rcParams['figure.figsize'] = (18.0, 6.0) sns.boxplot(x=""city_var"", y=""means"", data=city_pvars) # From this we observe that P1, P2, P11, P19, P20, P23, and P30 are approximately a good # proxy for geographical location.",No,5,53.0 "from sklearn import cluster def adjust_cities(full_full_data, train, k): # As found by box plot of each city's mean over each p-var relevant_pvars = [""P1"", ""P2"", ""P11"", ""P19"", ""P20"", ""P23"",""P30""] train = train.loc[:, relevant_pvars] # Optimal k is 20 as found by DB-Index plot kmeans = cluster.KMeans(n_clusters=k) kmeans.fit(train) # Get the cluster centers and classify city of each full_data instance to one of the centers full_data['City_Cluster'] = kmeans.predict(full_data.loc[:, relevant_pvars]) return full_data'",No,5,84.0 "num_train = train.shape[0] num_test = test.shape[0] print(num_train, num_test) full_data = pd.concat([train, 
test], ignore_index=True) ",No,4,11.0 "b""# \nfull_data = adjust_cities(full_data, train, 20)\nfull_data\n\n# City\nfull_data = full_data.drop(['City'], axis=1)""",No,3,8.0 "# Split into train and test datasets train = full_data[:num_train] test = full_data[num_train:] # check the shapes print(""Train :"",train.shape) print(""Test:"",test.shape) test",No,4,13.0 "train[['City_Cluster','revenue']].groupby('City_Cluster').mean().plot(kind='bar') plt.title('Mean Revenue Generated vs City Cluster') plt.xlabel('City Cluster') plt.ylabel('Mean Revenue Generated')",No,5,33.0 "mean_revenue_per_city = train[['City_Cluster', 'revenue']].groupby('City_Cluster', as_index=False).mean() mean_revenue_per_city.head() mean_revenue_per_city['revenue'] = mean_revenue_per_city['revenue'].apply(lambda x: int(x/1e6)) mean_revenue_per_city mean_dict = dict(zip(mean_revenue_per_city.City_Cluster, mean_revenue_per_city.revenue)) mean_dict",No,2,60.0 "b""city_rev = []\n\nfor i in full_data['City_Cluster']:\n for key, value in mean_dict.items():\n if i == key:\n city_rev.append(value)\n \ndf_city_rev = pd.DataFrame({'city_rev':city_rev})\nfull_data = pd.concat([full_data,df_city_rev],axis=1)\nfull_data.head\n\n# \nnominal_list.extend(['City_Cluster'])\n# \nnominal_list.remove('City')\n""",No,5,53.0 "from sklearn.preprocessing import LabelEncoder le = LabelEncoder() le_count = 0 # Iterate through the columns # for col in application_full_data: for i in range(len(nominal_list)): # if application_full_data[col].dtype == 'object': # If 2 or fewer unique categories if len(list(full_data[nominal_list[i]].unique())) <= 2: # full_data on the full_dataing data le.fit(full_data[nominal_list[i]]) # Transform both full_dataing and testing data full_data[nominal_list[i]] = le.transform(full_data[nominal_list[i]]) # Keep track of how many columns were label encoded le_count += 1 print('%d columns were label encoded.' 
% le_count)",No,5,20.0 "# one-hot encoding of categorical variables full_data = pd.get_dummies(full_data) print('full_dataing Features shape: ', full_data.shape)",No,4,20.0 "# # for col in num_list: # outliers = tukey_outliers(train[col]) # if len(outliers): # print(f""* {col} has these tukey outliers,\ {outliers}\ "") # else: # print(f""* {col} doesn't have any tukey outliers.\ "")'",No,5,53.0 "columns = len(num_list)/4+1 # boxplot fig = plt.figure(figsize=(15,20)) plt.subplots_adjust(hspace=0.2, wspace=0.8) for i in range(len(num_list)): ax = fig.add_subplot(columns, 4, i+1) sns.boxplot(y=full_data[num_list[i]], data=full_data, ax=ax) plt.show()",No,5,33.0 "skewed_data = train[num_list].apply(lambda x: skew(x)).sort_values(ascending=False) skewed_data[:10]",No,3,47.0 "# Split into train and test datasets train = full_data[:num_train] test = full_data[num_train:] # check the shapes print(""Train :"",train.shape) print(""Test:"",test.shape)",No,4,13.0 "sns.set(font_scale=1.1) correlation_train = train.corr() mask = np.triu(correlation_train.corr()) fig = plt.figure(figsize=(50,50)) sns.heatmap(correlation_train, annot=True, fmt='.1f', cmap='coolwarm', square=True, # mask=mask, linewidths=1) plt.show()",No,5,80.0 "# 10 train = train[cols] # train_X = train.drop(""revenue"",axis=1) train_y = train[""revenue""] #revenue train_y = np.log1p(train_y) # tmp_cols = train_X.columns test_X = test[tmp_cols] # print(""train_X: ""+str(train_X.shape)) print(""train_y: ""+str(train_y.shape)) print(""test_X: ""+str(test_X.shape))'",No,3,13.0 "# from sklearn.model_selection import train_test_split # (X_train, X_test, y_train, y_test) = train_test_split(train_X, train_y , test_size = 0.3 , random_state = 0) print(""X_train: ""+str(X_train.shape)) print(""X_test: ""+str(X_test.shape)) print(""y_train: ""+str(y_train.shape)) print(""y_test: ""+str(y_test.shape))'",No,4,13.0 "from sklearn.model_selection import GridSearchCV, cross_val_score, learning_curve from sklearn.metrics import mean_absolute_error from sklearn.linear_model import Lasso from sklearn.linear_model import LinearRegression from sklearn.linear_model import Ridge from sklearn.linear_model import ElasticNet from sklearn.neighbors import KNeighborsRegressor from sklearn.svm import SVR from sklearn.ensemble import RandomForestRegressor from sklearn.ensemble import GradientBoostingRegressor from sklearn.ensemble import AdaBoostRegressor from sklearn.tree import DecisionTreeRegressor from xgboost import XGBRegressor",No,5,22.0 "# random_state = 2 classifiers = [] classifiers.append(Lasso(random_state=random_state)) classifiers.append(LinearRegression()) classifiers.append(Ridge(random_state=random_state)) classifiers.append(ElasticNet(random_state=random_state)) classifiers.append(KNeighborsRegressor()) classifiers.append(SVR()) classifiers.append(RandomForestRegressor(random_state=random_state)) classifiers.append(GradientBoostingRegressor()) classifiers.append(AdaBoostRegressor(random_state = random_state)) classifiers.append(DecisionTreeRegressor()) classifiers.append(XGBRegressor())'",No,5,4.0 "#classifier cv_results = [] for classifier in classifiers : cv_results.append(cross_val_score(classifier, X_train, y_train, scoring='neg_mean_squared_error', cv =10, n_jobs=4)) #classifier cv_means = [] cv_std = [] for cv_result in cv_results: cv_means.append(cv_result.mean()) cv_std.append(cv_result.std()) cv_res = pd.DataFrame({""CrossValMeans"":cv_means,""CrossValerrors"": cv_std,""Algorithm"":[""Lasso"",""LinearRegression"",""Ridge"", 
""ElasticNet"",""KNeighborsRegressor"",""SVR"",""RandomForestRegressor"",""GradientBoostingRegressor"",""AdaBoostRegressor"",""DecisionTreeRegressor"", ""XGBRegressor""]})'",No,3,28.0 "g = sns.barplot(""CrossValMeans"",""Algorithm"",data = cv_res, palette=""Set3"",orient = ""h"",**{'xerr':cv_std}) g.set_xlabel(""Mean Accuracy"") g = g.set_title(""Cross validation scores"")'",No,5,84.0 "cv_res.sort_values(ascending=False, by='CrossValMeans')",No,5,9.0 "from sklearn import datasets from sklearn.linear_model import Ridge from sklearn.model_selection import train_test_split from sklearn.metrics import mean_absolute_error import optuna def objective(trial): params = { 'alpha': trial.suggest_loguniform(""alpha"", 0.1, 5), 'fit_intercept': trial.suggest_categorical('fit_intercept', [True, False]), 'normalize': trial.suggest_categorical('normalize', [True, False]), } reg = Ridge(**params) reg.fit(X_train, y_train) y_pred = reg.predict(X_test) mae = mean_absolute_error(y_test, y_pred) return mae '",No,3,7.0 "b""# optuna \nstudy = optuna.create_study()\nstudy.optimize(objective, n_trials=100)\n\n# \nprint(f'best score: {study.best_value:.4f}, best params: {study.best_params}')""",No,4,2.0 "params = {'alpha': 1.9510706324753746, 'fit_intercept': True, 'normalize': True} reg = Ridge(**params) reg.fit(X_train, y_train) prediction_log = reg.predict(test_X) prediction =np.exp(prediction_log) print(prediction)",No,3,48.0 "# CSV(submission) submission = pd.DataFrame({""Id"":test_Id, ""Prediction"":prediction}) submission.to_csv(""submission.csv"", index=False)'",No,3,25.0 "%matplotlib inline import os import numpy as np import pandas as pd import matplotlib.pyplot as plt import seaborn as sns import matplotlib from scipy import stats from scipy.stats import norm, skew from sklearn import preprocessing from sklearn.metrics import r2_score from sklearn.metrics import mean_squared_error from sklearn.model_selection import train_test_split from xgboost import XGBRegressor, plot_importance from sklearn.model_selection import RandomizedSearchCV from sklearn.model_selection import RepeatedStratifiedKFold from sklearn.model_selection import StratifiedKFold from sklearn.model_selection import KFold for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename))",No,3,22.0 "df = pd.read_csv('/kaggle/input/restaurant-revenue-prediction/train.csv.zip') df.shape",No,4,45.0 "test_df = pd.read_csv('/kaggle/input/restaurant-revenue-prediction/test.csv.zip') test_df.shape",No,5,45.0 df.head(),No,5,41.0 "def display_all(df): with pd.option_context(""display.max_rows"", 1000, ""display.max_columns"", 1000): display(df) display_all(df.head().transpose())",No,5,41.0 df.isnull().sum().sort_index()/len(df),No,5,39.0 "fig, ax = plt.subplots(1,2, figsize=(19, 5)) g1 = sns.countplot(df['Type'],palette=""Set2"", ax=ax[0]); g2 = sns.countplot(test_df['Type'],palette=""Set2"", ax=ax[1]); fig.show()'",No,5,33.0 "fig, ax = plt.subplots(1,2, figsize=(19, 5)) g1 = sns.countplot(df['City Group'],palette=""Set2"", ax=ax[0]); g2 = sns.countplot(test_df['City Group'],palette=""Set2"", ax=ax[1]); fig.show()'",No,5,33.0 "(df['City'].nunique(), test_df['City'].nunique())",No,5,54.0 "test_df.loc[test_df['Type']=='MB', 'Type'] = 'DT'",No,5,8.0 "df.drop('City', axis=1, inplace=True) test_df.drop('City', axis=1, inplace=True)",No,5,10.0 "import datetime df.drop('Id',axis=1,inplace=True) df['Open Date'] = pd.to_datetime(df['Open Date']) test_df['Open Date'] = pd.to_datetime(test_df['Open 
Date']) launch_date = datetime.datetime(2015, 3, 23) # scale days open df['Days Open'] = (launch_date - df['Open Date']).dt.days / 1000 test_df['Days Open'] = (launch_date - test_df['Open Date']).dt.days / 1000 df.drop('Open Date', axis=1, inplace=True) test_df.drop('Open Date', axis=1, inplace=True)",Yes,3,8.0 "plt.rc('figure', max_open_warning = 0) for i in range(1,38): fig, ax = plt.subplots(1,2, figsize=(19, 5)) g1 = sns.distplot(df['P{}'.format(i)], ax=ax[0], kde=False); g2 = sns.distplot(test_df['P{}'.format(i)], ax=ax[1], kde=False); fig.show()",No,5,33.0 df.dtypes,No,1,37.0 "b""(mu, sigma) = norm.fit(df['revenue'])\nf, (ax1, ax2) = plt.subplots(1, 2, figsize=(19, 5))\nax1 = sns.distplot(df['revenue'] , fit=norm, ax=ax1)\nax1.legend([f'Normal distribution ($\\mu=$ {mu:.3f} and $\\sigma=$ {sigma:.3f})'], loc='best')\nax1.set_ylabel('Frequency')\nax1.set_title('Revenue Distribution')\nax2 = stats.probplot(df['revenue'], plot=plt)\nf.show();""",No,5,33.0 "b""# Revenue is right skewed, taking the log will make it more normally distributed for the linear models\n# Remember to use expm1 on predictions to transform back to dollar amount\n(mu, sigma) = norm.fit(np.log1p(df['revenue']))\nf, (ax1, ax2) = plt.subplots(1, 2, figsize=(19, 5))\nax1 = sns.distplot(np.log1p(df['revenue']) , fit=norm, ax=ax1)\nax1.legend([f'Normal distribution ($\\mu=$ {mu:.3f} and $\\sigma=$ {sigma:.3f})'], loc='best')\nax1.set_ylabel('Frequency')\nax1.set_title('Log(1+Revenue) Distribution')\nax2 = stats.probplot(np.log(df['revenue']), plot=plt)\nf.show();""",No,4,33.0 "# Correlation between numeric features with revenue plt.figure(figsize=(10, 8)) sns.heatmap(df.drop(['revenue','City Group','Type'], axis=1).corr(), square=True) plt.suptitle('Pearson Correlation Heatmap') plt.show();",No,5,80.0 "corr_with_revenue = df.drop(['City Group','Type'],axis=1).corr()['revenue'].sort_values(ascending=False) plt.figure(figsize=(10,7)) corr_with_revenue.drop('revenue').plot.bar() plt.show();",No,5,33.0 "sns.pairplot(df[df.corr()['revenue'].sort_values(ascending=False).index[:5]]) plt.show();",No,5,81.0 "# copy_df = df.copy() # copy_test_df = test_df.copy() # numeric_features = df.dtypes[df.dtypes != ""object""].index # skewed_features = df[numeric_features].apply(lambda x: skew(x)) # skewed_features = skewed_features[skewed_features > 0.5].index # df[skewed_features] = np.log1p(df[skewed_features]) # test_df[skewed_features.drop('revenue')] = np.log1p(test_df[skewed_features.drop('revenue')]) # Above handles skewed features using log transformation # Below uses multiple imputation for P1-P37, since they are actually categorical from sklearn.experimental import enable_iterative_imputer from sklearn.impute import IterativeImputer imp_train = IterativeImputer(max_iter=30, missing_values=0, sample_posterior=True, min_value=1, random_state=37) imp_test = IterativeImputer(max_iter=30, missing_values=0, sample_posterior=True, min_value=1, random_state=23) p_data = ['P'+str(i) for i in range(1,38)] df[p_data] = np.round(imp_train.fit_transform(df[p_data])) test_df[p_data] = np.round(imp_test.fit_transform(test_df[p_data]))'",No,4,17.0 "# drop_first=True for Dummy Encoding for object types, and drop_first=False for OHE columnsToEncode = df.select_dtypes(include=[object]).columns df = pd.get_dummies(df, columns=columnsToEncode, drop_first=False) test_df = pd.get_dummies(test_df, columns=columnsToEncode, drop_first=False)",No,5,20.0 "df['revenue'] = np.log1p(df['revenue']) X, y = df.drop('revenue', axis=1), df['revenue'] X_train, 
X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=118)",No,3,13.0 "from sklearn.model_selection import GridSearchCV from sklearn.linear_model import Ridge from sklearn.linear_model import Lasso",No,5,22.0 "params_ridge = { 'alpha' : [.01, .1, .5, .7, .9, .95, .99, 1, 5, 10, 20], 'fit_intercept' : [True, False], 'normalize' : [True,False], 'solver' : ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'] } ridge_model = Ridge() ridge_regressor = GridSearchCV(ridge_model, params_ridge, scoring='neg_root_mean_squared_error', cv=5, n_jobs=-1) ridge_regressor.fit(X_train, y_train) print(f'Optimal alpha: {ridge_regressor.best_params_[""alpha""]:.2f}') print(f'Optimal fit_intercept: {ridge_regressor.best_params_[""fit_intercept""]}') print(f'Optimal normalize: {ridge_regressor.best_params_[""normalize""]}') print(f'Optimal solver: {ridge_regressor.best_params_[""solver""]}') print(f'Best score: {ridge_regressor.best_score_}')'",No,4,6.0 "ridge_model = Ridge(alpha=ridge_regressor.best_params_[""alpha""], fit_intercept=ridge_regressor.best_params_[""fit_intercept""], normalize=ridge_regressor.best_params_[""normalize""], solver=ridge_regressor.best_params_[""solver""]) ridge_model.fit(X_train, y_train) y_train_pred = ridge_model.predict(X_train) y_pred = ridge_model.predict(X_test) print('Train r2 score: ', r2_score(y_train_pred, y_train)) print('Test r2 score: ', r2_score(y_test, y_pred)) train_rmse = np.sqrt(mean_squared_error(y_train_pred, y_train)) test_rmse = np.sqrt(mean_squared_error(y_test, y_pred)) print(f'Train RMSE: {train_rmse:.4f}') print(f'Test RMSE: {test_rmse:.4f}')'",No,3,28.0 "# Ridge Model Feature Importance ridge_feature_coef = pd.Series(index = X_train.columns, data = np.abs(ridge_model.coef_)) ridge_feature_coef.sort_values().plot(kind = 'bar', figsize = (13,5));",No,3,79.0 "params_lasso = { 'alpha' : [.01, .1, .5, .7, .9, .95, .99, 1, 5, 10, 20], 'fit_intercept' : [True, False], 'normalize' : [True,False], } lasso_model = Lasso() lasso_regressor = GridSearchCV(lasso_model, params_lasso, scoring='neg_root_mean_squared_error', cv=5, n_jobs=-1) lasso_regressor.fit(X_train, y_train) print(f'Optimal alpha: {lasso_regressor.best_params_[""alpha""]:.2f}') print(f'Optimal fit_intercept: {lasso_regressor.best_params_[""fit_intercept""]}') print(f'Optimal normalize: {lasso_regressor.best_params_[""normalize""]}') print(f'Best score: {lasso_regressor.best_score_}')'",No,4,6.0 "lasso_model = Lasso(alpha=lasso_regressor.best_params_[""alpha""], fit_intercept=lasso_regressor.best_params_[""fit_intercept""], normalize=lasso_regressor.best_params_[""normalize""]) lasso_model.fit(X_train, y_train) y_train_pred = lasso_model.predict(X_train) y_pred = lasso_model.predict(X_test) print('Train r2 score: ', r2_score(y_train_pred, y_train)) print('Test r2 score: ', r2_score(y_test, y_pred)) train_rmse = np.sqrt(mean_squared_error(y_train_pred, y_train)) test_rmse = np.sqrt(mean_squared_error(y_test, y_pred)) print(f'Train RMSE: {train_rmse:.4f}') print(f'Test RMSE: {test_rmse:.4f}')'",No,4,28.0 "# Lasso Model Feature Importance lasso_feature_coef = pd.Series(index = X_train.columns, data = np.abs(lasso_model.coef_)) lasso_feature_coef.sort_values().plot(kind = 'bar', figsize = (13,5));",No,3,79.0 "from sklearn.linear_model import ElasticNetCV, ElasticNet # Use ElasticNetCV to tune alpha automatically instead of redundantly using ElasticNet and GridSearchCV el_model = ElasticNetCV(l1_ratio=[.1, .5, .7, .9, .95, .99, 1], eps=5e-2, cv=10, n_jobs=-1) el_model.fit(X_train, 
y_train) print(f'Optimal alpha: {el_model.alpha_:.6f}') print(f'Optimal l1_ratio: {el_model.l1_ratio_:.3f}') print(f'Number of iterations {el_model.n_iter_}')",No,4,6.0 "y_train_pred = el_model.predict(X_train) y_pred = el_model.predict(X_test) print('Train r2 score: ', r2_score(y_train_pred, y_train)) print('Test r2 score: ', r2_score(y_test, y_pred)) train_rmse = np.sqrt(mean_squared_error(y_train_pred, y_train)) test_rmse = np.sqrt(mean_squared_error(y_test, y_pred)) print(f'Train RMSE: {train_rmse:.4f}') print(f'Test RMSE: {test_rmse:.4f}')",No,3,28.0 "# ElasticNet Model Feature Importance el_feature_coef = pd.Series(index = X_train.columns, data = np.abs(el_model.coef_)) n_features = (el_feature_coef>0).sum() print(f'{n_features} features with reduction of {(1-n_features/len(el_feature_coef))*100:2.2f}%') el_feature_coef.sort_values().plot(kind = 'bar', figsize = (13,5));",No,5,79.0 "import numpy as np import pandas as pd from matplotlib import pyplot as plt import seaborn as sns import os from math import sqrt",No,5,22.0 "for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename))",No,5,88.0 "PATH='/kaggle/input/restaurant-revenue-prediction' train_df=pd.read_csv(os.path.join(PATH,'train.csv.zip')) test_df=pd.read_csv(os.path.join(PATH,'test.csv.zip'))",No,5,45.0 "print('Train Data Shape:',train_df.shape) print('Test Data Shape:',test_df.shape) print('Features:',train_df.columns)",No,4,58.0 train_df.head(),No,5,41.0 "sns.distplot(train_df['revenue'],hist=False) plt.title('Target Variable Distribution') plt.show()",No,5,33.0 train_df.isnull().sum(),No,5,39.0 "def get_month(date): return int(date.split('/')[0]) def get_year(date): return int(date.split('/')[-1]) train_df['Month']=train_df['Open Date'].apply(get_month) train_df['Year']=train_df['Open Date'].apply(get_year)",No,5,8.0 "test_df['Month']=test_df['Open Date'].apply(get_month) test_df['Year']=test_df['Open Date'].apply(get_year)",No,5,8.0 "print(train_df['Id'].shape) # the id has all unique values hence removing it train_df.drop('Id',axis=1,inplace=True) test_indexes=test_df['Id'] test_df.drop('Id',axis=1,inplace=True)",No,3,10.0 "plt.figure(figsize=(10,5)) sns.countplot(x='Month',data=train_df) plt.xlabel('Opening Month') plt.ylabel('Openings') plt.title('No of openings per month') plt.show()",No,5,75.0 "plt.figure(figsize=(13,5)) months_revenue_mean=train_df.groupby('Month')['revenue'].mean() sns.pointplot(x=months_revenue_mean.index,y=months_revenue_mean.values) plt.title('Revenue Vs Month') plt.show()",No,5,33.0 "(train_df['Month']=='05').sum() (train_df['Month']=='06').sum() (train_df['Month']=='07').sum()",No,5,72.0 "plt.figure(figsize=(13,5)) sns.countplot(x='Year',data=train_df) plt.ylabel('Number of Openings') plt.title('Number Of Openings Per Year') plt.show()",No,5,75.0 "plt.figure(figsize=(14,5)) year_revenue_means=train_df.groupby('Year')['revenue'].mean() sns.pointplot(year_revenue_means.index,year_revenue_means.values) plt.xlabel('Revenue') plt.ylabel('Year') plt.title('Revenue Per Year') plt.show()",No,5,33.0 "print('Datapoints in Year 2013:',(train_df['Year']=='2013').sum()) print('Datapoints in Year 2014:',(train_df['Year']=='2014').sum())",No,5,40.0 "print(""City Group Categoies:"",train_df['City Group'].unique())'",No,5,57.0 "sns.countplot('City Group', data=train_df) plt.title('City Group Counts') plt.show()",No,5,33.0 train_df['City Group'].value_counts(),No,5,72.0 "city_group_revenue_means=train_df.groupby('City Group')['revenue'].sum() 
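# note: despite the _means suffix this holds the total revenue per city group, since .sum() is used rather than .mean()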
city_group_revenue_means",No,5,60.0 "sns.lineplot(x='City Group',y='revenue',data=train_df)",No,5,81.0 "## converting it into dummies city_group_dummies=pd.get_dummies(train_df['City Group']) train_df=pd.concat([train_df,city_group_dummies],axis=1)",No,4,11.0 "test_city_group_dummies=pd.get_dummies(test_df['City Group']) test_df=pd.concat([test_df,test_city_group_dummies],axis=1)",No,4,11.0 "print('Tyes in train df:',train_df['Type'].unique()) print('Types in test df:',test_df['Type'].unique())",No,5,57.0 "fig,ax = plt.subplots(1,2,figsize=(9,5)) sns.countplot(train_df.Type,ax=ax[0]) ax[0].set_title('Train set') sns.countplot(test_df.Type,ax=ax[1]) ax[1].set_title('Test set') plt.show()",No,5,33.0 "type_map={'IL':0,'FC':1,'DT':2,'MB':3} train_df['Type']=train_df['Type'].apply(lambda type:type_map[type]) test_df['Type']=test_df['Type'].apply(lambda type:type_map[type])",No,4,20.0 "## converting the type into dummies type_dummies=pd.get_dummies(train_df['Type']) train_df=pd.concat([train_df,type_dummies],axis=1) train_df['3']=[0]*train_df.shape[0]",No,4,11.0 "test_type_dummies=pd.get_dummies(test_df['Type']) test_df=pd.concat([test_df,test_type_dummies],axis=1)",No,4,11.0 train_df['City'].unique() ,No,5,57.0 "# dropping all the columns which have been utilized already train_df.drop(['Open Date','City','City Group','Type'],axis=1,inplace=True) test_df.drop(['Open Date','City','City Group','Type'],axis=1,inplace=True)",No,5,10.0 test_df.head(),No,5,41.0 "from sklearn.model_selection import train_test_split from sklearn.linear_model import LinearRegression from sklearn.tree import DecisionTreeRegressor from sklearn.ensemble import RandomForestRegressor from sklearn.svm import SVR from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score from sklearn.model_selection import KFold",No,5,22.0 "print('Train Data Shape After EDA:',train_df.shape) print('Test Data Shape After EDA:',test_df.shape)",No,5,58.0 "Y=train_df['revenue'] train_df.drop('revenue',axis=1,inplace=True) X=train_df.values X_Test=test_df.values",No,5,21.0 "X.shape,X_Test.shape",No,5,58.0 "regressor_models={ 'Linear Regression':LinearRegression(), 'Decision Tree Regressor':DecisionTreeRegressor(), 'Random Forest Regressor':RandomForestRegressor(), 'SVR':SVR(), }",No,3,23.0 "def get_rmse_score(model,x_train,x_test,y_train,y_test): model.fit(x_train,y_train) y_predicted=model.predict(x_test) r2_score=model.score(x_test,y_test) rmse=sqrt(mean_squared_error(y_test,y_predicted)) return rmse,r2_score",No,4,28.0 "x_train,x_test,y_train,y_test=train_test_split(X,Y,random_state=42)",No,5,13.0 "svr_model=SVR() svr_model.fit(x_train,y_train) Y_Test_predictions=svr_model.predict(X_Test)",No,3,27.0 "predictions=[] for index in range(len(Y_Test_predictions)): predictions.append([test_indexes[index],Y_Test_predictions[index]])",No,3,48.0 "predictions_df=pd.DataFrame(predictions,columns=['Id','Prediction']) predictions_df.to_csv('SVM_Predictions.csv',index=False)",No,5,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv) # Input data files are available in the read-only ""../input/"" directory # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory import os for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename)) # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using ""Save & Run All"" # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session'",No,4,22.0 "import pandas as pd import numpy as np import seaborn as sns import matplotlib.pyplot as plt import warnings warnings.filterwarnings(""ignore"")",No,5,23.0 "train_path = '../input/restaurant-revenue-prediction/train.csv.zip' df_train = pd.read_csv(train_path) df_train.head()",No,3,41.0 "test_path = '../input/restaurant-revenue-prediction/test.csv.zip' df_test = pd.read_csv(test_path, index_col='Id') df_test.head()",No,3,41.0 "from datetime import date, datetime def calculate_age(born): born = datetime.strptime(born, ""%m/%d/%Y"").date() today = date.today() return today.year - born.year - ((today.month, today.day) < (born.month, born.day)) df_train['Age'] = df_train['Open Date'].apply(calculate_age) df_test['Age'] = df_test['Open Date'].apply(calculate_age) # Drop 'Open Date' column from Dataframes df_train = df_train.drop('Open Date', axis=1) df_test = df_test.drop('Open Date', axis=1) # Drop 'Id' column from Dataframes df_train = df_train.drop('Id', axis=1) df_train.head()'",No,3,41.0 "# Find the sets of categorical variables and numberical variales for feature analyses numerical_features = df_train.select_dtypes([np.number]).columns.tolist() categorical_features = df_train.select_dtypes(exclude = [np.number,np.datetime64]).columns.tolist() print(categorical_features) print(numerical_features)",No,3,40.0 "print(df_train['revenue'].describe()) sns.distplot(a=df_train['revenue'], kde=True).set(xlabel='revenue', ylabel='P(revenue)')",No,5,33.0 df_train[df_train['revenue'] > 10000000 ],No,5,14.0 "# Drop outliers df_train = df_train[df_train['revenue'] < 10000000 ] df_train.reset_index(drop=True).head()",No,3,10.0 "fig, ax = plt.subplots(3, 1, figsize=(40, 30)) for variable, subplot in zip(categorical_features, ax.flatten()): df_2 = df_train[[variable,'revenue']].groupby(variable).revenue.sum().reset_index() df_2.columns = [variable,'total_revenue'] sns.barplot(x=variable, y='total_revenue', data=df_2 , ax=subplot) subplot.set_xlabel(variable,fontsize=20) subplot.set_ylabel('Total Revenue',fontsize=20) for label in subplot.get_xticklabels(): label.set_rotation(45) label.set_size(20) for label in subplot.get_yticklabels(): label.set_size(20) fig.tight_layout()",No,5,33.0 "fig, ax = plt.subplots(13, 3, figsize=(30, 35)) for variable, subplot in zip(numerical_features, ax.flatten()): sns.regplot(x=df_train[variable], y=df_train['revenue'], ax=subplot) subplot.set_xlabel(variable,fontsize=20) subplot.set_ylabel('Revenue',fontsize=20) fig.tight_layout()",No,5,33.0 "plt.figure(figsize=(45,25)) mask = np.triu(np.ones_like(df_train.corr(), dtype=np.bool)) sns.heatmap(df_train.corr(),annot=True, mask=mask) sns.set(font_scale=1.4)",No,5,80.0 "fig, ax = plt.subplots(3, 1, figsize=(40, 30)) for variable, subplot in zip(categorical_features, ax.flatten()): sns.swarmplot(x=variable, y='revenue', data=df_train, ax=subplot,size=10) subplot.set_xlabel(variable,fontsize=20) 
subplot.set_ylabel('Revenue',fontsize=20) for label in subplot.get_xticklabels(): label.set_rotation(45) label.set_size(18) for label in subplot.get_yticklabels(): label.set_size(18) fig.tight_layout()",No,5,33.0 "#Lets take a look at city group field print(""--- Train set ---"") print(df_train['City Group'].value_counts()) print(""---- Test set ----"") print(df_test['City Group'].value_counts())'",No,5,72.0 "#Lets take a look at type field print(""--- Train set ---"") print(df_train['Type'].value_counts()) print(""---- Test set ----"") print(df_test['Type'].value_counts()) '",No,5,72.0 "y = df_train['revenue'] df_train=df_train.drop('revenue', axis=1)",No,5,21.0 "print(""Shapes: Train set "", df_train.shape ,"", Test "",df_test.shape) df_full = pd.concat([df_train,df_test]) print(""Full dataset shapes: "", df_full.shape)",No,4,58.0 print('There are {} cities which restaurant location have been collected.'.format(len(df_full['City'].unique()))),No,3,58.0 "df_full = df_full.drop('City', axis=1) df_full.shape",No,4,10.0 "p_name = ['P'+str(i) for i in range(1,38)]",No,3,58.0 "from sklearn.decomposition import PCA pca = PCA().fit(df_full[p_name]) plt.figure(figsize=(7,5)) plt.plot(np.cumsum(pca.explained_variance_ratio_)) plt.xlabel('number of Components') plt.ylabel('Explained variance') plt.yticks(np.arange(0.1,1.1,0.05)) plt.xticks(np.arange(0,41,2)) plt.grid(True)",No,3,23.0 "pca_list = ['pca'+str(i) for i in range(1,30,1)] df_full[pca_list] = PCA(n_components=29).fit_transform(df_full[p_name]) df_full.drop(p_name,axis=1,inplace=True)",No,3,33.0 df_full.info(),No,4,10.0 "df=pd.get_dummies(df_full, dtype=float)",No,3,40.0 "# Get number of train sets numTrain=df_train.shape[0] train = df[:numTrain] test = df[numTrain:]",No,3,13.0 "sns.distplot(a=y, kde=True).set(xlabel='revenue', ylabel='P(revenue)')",No,5,33.0 "print(""Kurtosis: {}"".format(y.kurt())) print(""Skewness: {}"".format(y.skew()))",No,5,40.0 "from sklearn.model_selection import GridSearchCV from sklearn.linear_model import Lasso, Ridge, ElasticNet from sklearn.ensemble import AdaBoostRegressor from xgboost import XGBRegressor ",No,4,21.0 best_estimators=[],No,5,77.0 "## Parameters params = { ""alpha"" : [.01, .1, .5, .7, .9, .95, .99, 1, 5, 10, 20], ""fit_intercept"" : [True, False], ""normalize"" : [True,False], ""solver"" : ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'], ""tol"" : [0.0001, 0.001, 0.01, 0.1], ""random_state"" : [42] } ## Ridge ridge = Ridge() ridge_grid = GridSearchCV(ridge, params, scoring='r2', cv=5, n_jobs=-1) ridge_grid.fit(X_train, y_train) ## Output print(""Best parameters: {}:"".format(ridge_grid.best_params_)) print(""Best score: {}"".format(ridge_grid.best_score_)) ## Append to list best_estimators.append([""Ridge"",ridge_grid.best_estimator_])'",No,4,23.0 "params = { 'alpha' : [.01, .1, .5, .7, .9, .95, .99, 1, 5, 10, 20], 'fit_intercept' : [True, False], 'normalize' : [True,False], 'tol' : [0.0001, 0.001, 0.01, 0.1], ""random_state"" : [42] } ## Lasso lasso = Lasso() lasso_grid = GridSearchCV(lasso, params, scoring='r2', cv=5, n_jobs=-1) lasso_grid.fit(X_train, y_train) ## Output print(""Best parameters: {}:"".format(lasso_grid.best_params_)) print(""Best score: {}"".format(lasso_grid.best_score_)) ## Append to list best_estimators.append([""Lasso"",lasso_grid.best_estimator_])'",No,3,5.0 "# Parameters params = { ""alpha"" : [.01, .1, .5, .7, .9, .95, .99, 1, 5, 10, 20], ""fit_intercept"" : [True, False], ""normalize"" : [True,False], ""tol"" : [0.0001, 0.001, 0.01, 0.1], ""random_state"" : 
[42] } ## Elastic Net EL = ElasticNet() EL_grid = GridSearchCV(EL, params, scoring='r2', cv=5, n_jobs=-1) EL_grid.fit(X_train, y_train) ## Output print(""Best parameters: {}:"".format(EL_grid.best_params_)) print(""Best score: {}"".format(EL_grid.best_score_)) ## Append to list best_estimators.append([""ElasticNet"",EL_grid.best_estimator_])'",No,3,5.0 "# parameters params = { ""learning_rate"": [.1, .5, .7, .9, .95, .99, 1], ""colsample_bytree"": [.3, .4, .5, .6], ""max_depth"": [2, 4], ""alpha"": [1, 3, 5], ""subsample"": [.5], ""n_estimators"": [30, 70, 100, 200], ""random_state"" : [42] } ## XGBoost Regressor XGBR = XGBRegressor() XGBR_grid = GridSearchCV(XGBR, params, scoring='r2', cv=5, n_jobs=-1) XGBR_grid.fit(X_train, y_train) ## Output print(""Best parameters: {}:"".format(XGBR_grid.best_params_)) print(""Best score: {}"".format(XGBR_grid.best_score_)) ## Append to list best_estimators.append([""XGBoostR"",XGBR_grid.best_estimator_])'",No,3,5.0 "## parameters params = { ""n_estimators"": [10, 30, 50, 100], ""learning_rate"": [.01, 0.1, 0.5, 0.9, 0.95, 1], ""random_state"" : [42] } ## XGBoost Regressor AdaBoostR = AdaBoostRegressor() AdaBoostR_grid = GridSearchCV(AdaBoostR, params, scoring='r2', cv=5, n_jobs=-1) AdaBoostR_grid.fit(X_train, y_train) ## Output print(""Best parameters: {}:"".format(AdaBoostR_grid.best_params_)) print(""Best score: {}"".format(AdaBoostR_grid.best_score_)) ## Append to list best_estimators.append([""AdaBoostR"",AdaBoostR_grid.best_estimator_])'",No,3,5.0 "from sklearn.pipeline import Pipeline from sklearn.preprocessing import StandardScaler ## pipeline pipelines = [] for name,model in best_estimators: pipeline = Pipeline([(""Scaler"",StandardScaler()), (name,model) ]) pipelines.append([""Scaled_""+name,pipeline])",No,3,5.0 "from sklearn.model_selection import KFold, cross_val_score ## Create a dataframe to store all the models' cross validation score evaluate = pd.DataFrame(columns=[""model"",""cv"",""std""]) ## Encoded dataset for name,model in pipelines: kfold = KFold(n_splits=10,random_state=42) cv = cross_val_score(model, X_train, y_train, cv=kfold, n_jobs=-1, scoring=""neg_root_mean_squared_error"") row = evaluate.shape[0] evaluate.loc[row,""model""] = name evaluate.loc[row,""cv""] = round(cv.mean(),3) evaluate.loc[row,""std""] = ""+/- {}"".format(round(cv.std(),4)) evaluate = evaluate.sort_values(""cv"",ascending=False) evaluate'",No,2,22.0 from sklearn.ensemble import VotingRegressor,No,3,56.0 "## Creating a list for all combinations models votings = [] ## All models votings.append((""Scaled_all_models"",Pipeline([(""Scaler"",StandardScaler()), (""Votings"",VotingRegressor([(""XGBoostR"",XGBR_grid.best_estimator_), (""AdaBoostR"", AdaBoostR_grid.best_estimator_), (""Ridge"",ridge_grid.best_estimator_) ]) )]))) ### Combinations of two estimators ## Combination of RandomForestRegressor with BaggingRegressor & GradientBoostRegressor votings.append((""Scaled_XGBR_AB"",Pipeline([(""Scaler"",StandardScaler()), (""Votings"",VotingRegressor([(""XGBoostR"",XGBR_grid.best_estimator_), (""AdaBoostR"", AdaBoostR_grid.best_estimator_) ]))]) )) votings.append((""Scaled_XGBR_R"",Pipeline([(""Scaler"",StandardScaler()), (""Votings"",VotingRegressor([(""XGBoostR"",XGBR_grid.best_estimator_), (""Ridge"",ridge_grid.best_estimator_) ]))]))) votings.append((""Scaled_AB_R"",Pipeline([(""Scaler"",StandardScaler()), (""Votings"",VotingRegressor([(""AdaBoostR"", AdaBoostR_grid.best_estimator_), (""Ridge"",ridge_grid.best_estimator_) ]))])))",No,5,82.0 !pip install 
python-googlegeocoder,No,4,22.0 "import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns from googlegeocoder import GoogleGeocoder from bokeh.plotting import figure, save import plotly.graph_objects as go import sklearn import warnings warnings.filterwarnings(""ignore"") plt.style.use('ggplot') pd.plotting.register_matplotlib_converters()'",No,5,23.0 "train_df=pd.read_csv('/kaggle/input/covid19-global-forecasting-week-2/train.csv') test_df=pd.read_csv('/kaggle/input/covid19-global-forecasting-week-2/test.csv') all_data=pd.concat([train_df,test_df],axis=0) all_data.reset_index(drop=True)",No,3,45.0 "train_df.info() ",No,5,40.0 train_df['Country_Region'].nunique(),No,5,54.0 "print(""fill blanks and add region for counting"") train_df.drop('Province_State',axis=1,inplace=True)'",No,5,10.0 "# Resetting Date column into Datetime object and making it an index of dataframe train_df['Date']=pd.to_datetime(train_df['Date']) train_df.set_index('Date',inplace=True)",No,4,16.0 "from sklearn.neighbors import KNeighborsRegressor params_knn = { 'n_neighbors' : [3, 5, 7, 9, 11], } knn_model = KNeighborsRegressor() knn_regressor = GridSearchCV(knn_model, params_knn, scoring='neg_root_mean_squared_error', cv=10, n_jobs=-1) knn_regressor.fit(X_train, y_train) print(f'Optimal neighbors: {knn_regressor.best_params_[""n_neighbors""]}') print(f'Best score: {knn_regressor.best_score_}')'",No,4,6.0 "knn_model = KNeighborsRegressor(n_neighbors=knn_regressor.best_params_[""n_neighbors""]) knn_model.fit(X_train, y_train) y_train_pred = knn_model.predict(X_train) y_pred = knn_model.predict(X_test) print('Train r2 score: ', r2_score(y_train_pred, y_train)) print('Test r2 score: ', r2_score(y_test, y_pred)) train_rmse = np.sqrt(mean_squared_error(y_train_pred, y_train)) test_rmse = np.sqrt(mean_squared_error(y_test, y_pred)) print(f'Train RMSE: {train_rmse:.4f}') print(f'Test RMSE: {test_rmse:.4f}')'",No,3,7.0 "from sklearn.ensemble import RandomForestRegressor params_rf = { 'max_depth': [10, 30, 35, 50, 65, 75, 100], 'max_features': [.3, .4, .5, .6], 'min_samples_leaf': [3, 4, 5], 'min_samples_split': [8, 10, 12], 'n_estimators': [30, 50, 100, 200] } rf = RandomForestRegressor() rf_regressor = GridSearchCV(rf, params_rf, scoring='neg_root_mean_squared_error', cv = 10, n_jobs = -1) rf_regressor.fit(X_train, y_train) print(f'Optimal depth: {rf_regressor.best_params_[""max_depth""]}') print(f'Optimal max_features: {rf_regressor.best_params_[""max_features""]}') print(f'Optimal min_sample_leaf: {rf_regressor.best_params_[""min_samples_leaf""]}') print(f'Optimal min_samples_split: {rf_regressor.best_params_[""min_samples_split""]}') print(f'Optimal n_estimators: {rf_regressor.best_params_[""n_estimators""]}') print(f'Best score: {rf_regressor.best_score_}')'",No,5,2.0 "rf_model = RandomForestRegressor(max_depth=rf_regressor.best_params_[""max_depth""], max_features=rf_regressor.best_params_[""max_features""], min_samples_leaf=rf_regressor.best_params_[""min_samples_leaf""], min_samples_split=rf_regressor.best_params_[""min_samples_split""], n_estimators=rf_regressor.best_params_[""n_estimators""], n_jobs=-1, oob_score=True) rf_model.fit(X_train, y_train) y_train_pred = rf_model.predict(X_train) y_pred = rf_model.predict(X_test) print('Train r2 score: ', r2_score(y_train_pred, y_train)) print('Test r2 score: ', r2_score(y_test, y_pred)) train_rmse = np.sqrt(mean_squared_error(y_train_pred, y_train)) test_rmse = np.sqrt(mean_squared_error(y_test, y_pred)) print(f'Train RMSE: 
{train_rmse:.4f}') print(f'Test RMSE: {test_rmse:.4f}')'",No,3,7.0 "# # City, City Group, Type from sklearn.preprocessing import LabelEncoder le = LabelEncoder() df_all[""City""] = le.fit_transform(df_all[""City""]) df_all[""City Group""] = le.fit_transform(df_all[""City Group""]) df_all[""Type""] = le.fit_transform(df_all[""Type""]) # df_all'",No,4,20.0 "# df_train_fin = df_all.iloc[:df_train.shape[0]] # df_train df_test_fin = df_all.iloc[df_train.shape[0]:] # df_test'",No,5,13.0 "from sklearn.ensemble import RandomForestRegressor # IDOpenDate out_columns = [""Id"", ""Open Date""] columns = [] for i in df_train_fin.columns: if i not in out_columns: columns.append(i) x_train = df_train_fin[columns] # rfr = RandomForestRegressor( n_estimators=200, max_depth=5, max_features=0.5, random_state=449, n_jobs=-1 ) rfr.fit(x_train, y_train) # rfr.score(x_train, y_train)'",No,3,7.0 pred = rfr.predict(df_test_fin[columns]),No,5,48.0 df_submission,No,5,41.0 "df_submission['Prediction'] = pred df_submission.to_csv('/kaggle/working/RandamForest_submission01.csv', index=False)",No,4,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load in import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) from matplotlib import pyplot as plt import seaborn as sns # from pandas_profiling import ProfileReport # Input data files are available in the ""../input/"" directory. # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory import os for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename)) # Any results you write to the current directory are saved as output.",No,5,88.0 "train = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-2/train.csv') test = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-2/test.csv') submission = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-2/submission.csv')",No,5,45.0 "from plotly.offline import iplot from plotly import tools import plotly.graph_objects as go import plotly.express as px import plotly.offline as py import plotly.figure_factory as ff from plotly.subplots import make_subplots import plotly.io as pio pio.templates.default = ""plotly_dark"" py.init_notebook_mode(connected=True)",No,5,23.0 "latest_grouped = train.groupby('Country_Region')['ConfirmedCases', 'Fatalities'].sum().reset_index()",No,5,60.0 "fig = px.bar(latest_grouped.sort_values('ConfirmedCases', ascending=False)[:20][::-1], x='ConfirmedCases', y='Country_Region', title='Confirmed Cases Worldwide', text='ConfirmedCases', height=1000, orientation='h') fig.show()",No,5,33.0 "europe = list(['Austria','Belgium','Bulgaria','Croatia','Cyprus','Czechia','Denmark','Estonia','Finland','France','Germany','Greece','Hungary','Ireland', 'Italy', 'Latvia','Luxembourg','Lithuania','Malta','Norway','Netherlands','Poland','Portugal','Romania','Slovakia','Slovenia', 'Spain', 'Sweden', 'United Kingdom', 'Iceland', 'Russia', 'Switzerland', 'Serbia', 'Ukraine', 'Belarus', 'Albania', 'Bosnia and Herzegovina', 'Kosovo', 'Moldova', 'Montenegro', 'North Macedonia']) europe_grouped_latest = latest_grouped[latest_grouped['Country_Region'].isin(europe)]",No,5,14.0 "temp = train[train['Country_Region'].isin(europe)] temp = temp.groupby(['Date', 
'Country_Region'])['ConfirmedCases'].sum().reset_index() temp['Date'] = pd.to_datetime(temp['Date']).dt.strftime('%m/%d/%Y') temp['size'] = temp['ConfirmedCases'].pow(0.3) * 3.5 fig = px.scatter_geo(temp, locations=""Country_Region"", locationmode='country names', color=""ConfirmedCases"", size='size', hover_name=""Country_Region"", range_color=[1,100],scope='europe', projection=""natural earth"", animation_frame=""Date"", title='COVID-19: Cases Over Time', color_continuous_scale='Cividis_r') fig.show()'",No,5,33.0 "fig = px.bar(europe_grouped_latest.sort_values('ConfirmedCases', ascending=False)[:10][::-1], x='ConfirmedCases', y='Country_Region', color_discrete_sequence=['#84DCC6'], title='Confirmed Cases in Europe', text='ConfirmedCases', orientation='h') fig.show()",No,5,33.0 "usa = train[train['Country_Region'] == ""US""] usa_latest = usa[usa['Date'] == max(usa['Date'])] usa_latest = usa_latest.groupby('Province_State')['ConfirmedCases', 'Fatalities'].max().reset_index() fig = px.bar(usa_latest.sort_values('ConfirmedCases', ascending=False)[:10][::-1], x='ConfirmedCases', y='Province_State', color_discrete_sequence=['#D63230'], title='Confirmed Cases in USA', text='ConfirmedCases', orientation='h') fig.show()'",No,5,33.0 "ch = train[train['Country_Region'] == ""China""] ch = ch[ch['Date'] == max(ch['Date'])] ch = ch.groupby('Province_State')['ConfirmedCases', 'Fatalities'].max().reset_index() fig = px.bar(ch.sort_values('ConfirmedCases', ascending=False)[:10][::-1], x='ConfirmedCases', y='Province_State', color_discrete_sequence=['#D63230'], title='Confirmed Cases in china', text='ConfirmedCases', orientation='h') fig.show()'",No,5,33.0 "province_encoded = {state:index for index, state in enumerate(train['Province_State'].unique())}",No,5,77.0 "train['province_encoded'] = train['Province_State'].apply(lambda x: province_encoded[x]) train.head()",No,4,20.0 "country_encoded = dict(enumerate(train['Country_Region'].unique())) country_encoded = dict(map(reversed, country_encoded.items()))",No,5,20.0 "train['country_encoded'] = train['Country_Region'].apply(lambda x: country_encoded[x]) train.head()",No,4,20.0 "from datetime import datetime import time",No,5,22.0 "train['Mon'] = train['Date'].apply(lambda x: int(x.split('-')[1])) train['Day'] = train['Date'].apply(lambda x: int(x.split('-')[2]))",No,5,8.0 "train['serial'] = train['Mon'] * 30 + train['Day'] train.head()",No,4,8.0 train['serial'] = train['serial'] - train['serial'].min(),No,5,8.0 "gdp2020 = pd.read_csv('/kaggle/input/gdp2020/GDP2020.csv') population2020 = pd.read_csv('/kaggle/input/population2020/population2020.csv')",No,5,45.0 "gdp2020 = gdp2020.rename(columns={""rank"":""rank_gdp""}) gdp2020_numeric_list = [list(gdp2020)[0]] + list(gdp2020)[2:-1] gdp2020.head()",No,4,61.0 set(train['Country_Region']) - set(population2020['name']),No,5,57.0 set(train['Country_Region']) - set(gdp2020['country']),No,5,57.0 "population2020 = population2020.rename(columns={""rank"":""rank_pop""}) population2020_numeric_list = [list(population2020)[0]] + list(gdp2020)[2:] population2020.head()",No,4,61.0 "train = pd.merge(train, population2020, how='left', left_on = 'Country_Region', right_on = 'name') train = pd.merge(train, gdp2020, how='left', left_on = 'Country_Region', right_on = 'country')",No,5,32.0 train = train.fillna(-1),No,5,17.0 "# numeric_features_X = ['Lat','Long', 'province_encoded' ,'country_encoded','Mon','Day'] numeric_features_X = ['province_encoded' ,'country_encoded','Mon','Day'] + population2020_numeric_list + 
gdp2020_numeric_list numeric_features_Y = ['ConfirmedCases', 'Fatalities'] train_numeric_X = train[numeric_features_X] train_numeric_Y = train[numeric_features_Y]",No,5,21.0 test['province_encoded'] = test['Province_State'].apply(lambda x: province_encoded[x] if x in province_encoded else max(province_encoded.values())+1),No,5,8.0 test['country_encoded'] = test['Country_Region'].apply(lambda x: country_encoded[x] if x in country_encoded else max(country_encoded.values())+1),No,5,8.0 "test['Mon'] = test['Date'].apply(lambda x: int(x.split('-')[1])) test['Day'] = test['Date'].apply(lambda x: int(x.split('-')[2]))",No,3,16.0 "test['serial'] = test['Mon'] * 30 + test['Day'] test['serial'] = test['serial'] - test['serial'].min()",No,4,8.0 "test = pd.merge(test, population2020, how='left', left_on = 'Country_Region', right_on = 'name') test = pd.merge(test, gdp2020, how='left', left_on = 'Country_Region', right_on = 'country')",No,5,32.0 "test_numeric_X = test[numeric_features_X] test_numeric_X.isnull().sum()",No,4,21.0 test_numeric_X = test_numeric_X.fillna(-1),No,5,17.0 "from sklearn.pipeline import Pipeline from sklearn.preprocessing import StandardScaler from sklearn.linear_model import LinearRegression",No,5,22.0 "# Random Forest Model Feature Importance rf_feature_importance = pd.Series(index = X_train.columns, data = np.abs(rf_model.feature_importances_)) n_features = (rf_feature_importance>0).sum() print(f'{n_features} features with reduction of {(1-n_features/len(rf_feature_importance))*100:2.2f}%') rf_feature_importance.sort_values().plot(kind = 'bar', figsize = (13,5));",No,2,79.0 "import lightgbm as lgbm params_lgbm = { 'learning_rate': [.01, .1, .5, .7, .9, .95, .99, 1], 'boosting': ['gbdt'], 'metric': ['l1'], 'feature_fraction': [.3, .4, .5, 1], 'num_leaves': [20], 'min_data': [10], 'max_depth': [10], 'n_estimators': [10, 30, 50, 100] } lgb = lgbm.LGBMRegressor() lgb_regressor = GridSearchCV(lgb, params_lgbm, scoring='neg_root_mean_squared_error', cv = 10, n_jobs = -1) lgb_regressor.fit(X_train, y_train) print(f'Optimal lr: {lgb_regressor.best_params_[""learning_rate""]}') print(f'Optimal feature_fraction: {lgb_regressor.best_params_[""feature_fraction""]}') print(f'Optimal n_estimators: {lgb_regressor.best_params_[""n_estimators""]}') print(f'Best score: {lgb_regressor.best_score_}')'",No,5,6.0 "lgb_model = lgbm.LGBMRegressor(learning_rate=lgb_regressor.best_params_[""learning_rate""], boosting='gbdt', metric='l1', feature_fraction=lgb_regressor.best_params_[""feature_fraction""], num_leaves=20, min_data=10, max_depth=10, n_estimators=lgb_regressor.best_params_[""n_estimators""], n_jobs=-1) lgb_model.fit(X_train, y_train) y_train_pred = lgb_model.predict(X_train) y_pred = lgb_model.predict(X_test) print('Train r2 score: ', r2_score(y_train_pred, y_train)) print('Test r2 score: ', r2_score(y_test, y_pred)) train_rmse = np.sqrt(mean_squared_error(y_train_pred, y_train)) test_rmse = np.sqrt(mean_squared_error(y_test, y_pred)) print(f'Train RMSE: {train_rmse:.4f}') print(f'Test RMSE: {test_rmse:.4f}')'",No,3,7.0 "# LightGBM Feature Importance lgb_feature_importance = pd.Series(index = X_train.columns, data = np.abs(lgb_model.feature_importances_)) n_features = (lgb_feature_importance>0).sum() print(f'{n_features} features with reduction of {(1-n_features/len(lgb_feature_importance))*100:2.2f}%') lgb_feature_importance.sort_values().plot(kind = 'bar', figsize = (13,5));",No,5,79.0 "params_xgb = { 'learning_rate': [.1, .5, .7, .9, .95, .99, 1], 'colsample_bytree': [.3, .4, .5, .6], 
'max_depth': [4], 'alpha': [3], 'subsample': [.5], 'n_estimators': [30, 70, 100, 200] } xgb_model = XGBRegressor() xgb_regressor = GridSearchCV(xgb_model, params_xgb, scoring='neg_root_mean_squared_error', cv = 10, n_jobs = -1) xgb_regressor.fit(X_train, y_train) print(f'Optimal lr: {xgb_regressor.best_params_[""learning_rate""]}') print(f'Optimal colsample_bytree: {xgb_regressor.best_params_[""colsample_bytree""]}') print(f'Optimal n_estimators: {xgb_regressor.best_params_[""n_estimators""]}') print(f'Best score: {xgb_regressor.best_score_}')'",No,5,6.0 "xgb_model = XGBRegressor(learning_rate=xgb_regressor.best_params_[""learning_rate""], colsample_bytree=xgb_regressor.best_params_[""colsample_bytree""], max_depth=4, alpha=3, subsample=.5, n_estimators=xgb_regressor.best_params_[""n_estimators""], n_jobs=-1) xgb_model.fit(X_train, y_train) y_train_pred = xgb_model.predict(X_train) y_pred = xgb_model.predict(X_test) print('Train r2 score: ', r2_score(y_train_pred, y_train)) print('Test r2 score: ', r2_score(y_test, y_pred)) train_rmse = np.sqrt(mean_squared_error(y_train_pred, y_train)) test_rmse = np.sqrt(mean_squared_error(y_test, y_pred)) print(f'Train RMSE: {train_rmse:.4f}') print(f'Test RMSE: {test_rmse:.4f}')'",No,3,28.0 "# XGB with early stopping xgb_model.fit(X_train, y_train, early_stopping_rounds=4, eval_set=[(X_test, y_test)], verbose=False) y_train_pred = xgb_model.predict(X_train) y_pred = xgb_model.predict(X_test) print('Train r2 score: ', r2_score(y_train_pred, y_train)) print('Test r2 score: ', r2_score(y_test, y_pred)) train_rmse = np.sqrt(mean_squared_error(y_train_pred, y_train)) test_rmse = np.sqrt(mean_squared_error(y_test, y_pred)) print(f'Train RMSE: {train_rmse:.4f}') print(f'Test RMSE: {test_rmse:.4f}')",No,3,28.0 "# XGB Feature Importance, relevant features can be selected based on its score feature_important = xgb_model.get_booster().get_fscore() keys = list(feature_important.keys()) values = list(feature_important.values()) data = pd.DataFrame(data=values, index=keys, columns=['score']).sort_values(by = 'score', ascending=True) data.plot(kind='bar', figsize = (13,5)) plt.show()",No,3,79.0 "rf_model_en = RandomForestRegressor(max_depth=200, max_features=0.4, min_samples_leaf=3, min_samples_split=6, n_estimators=30, n_jobs=-1, oob_score=True) rf_model_en.fit(X_train, y_train) y_train_pred = rf_model_en.predict(X_train) y_pred = rf_model_en.predict(X_test) print('Train r2 score: ', r2_score(y_train_pred, y_train)) print('Test r2 score: ', r2_score(y_test, y_pred)) train_rmse = np.sqrt(mean_squared_error(y_train_pred, y_train)) test_rmse = np.sqrt(mean_squared_error(y_test, y_pred)) print(f'Train RMSE: {train_rmse:.4f}') print(f'Test RMSE: {test_rmse:.4f}')",No,4,28.0 "from numpy import mean from numpy import std from sklearn.datasets import make_regression from sklearn.model_selection import cross_val_score from sklearn.model_selection import RepeatedKFold from sklearn.linear_model import LinearRegression from sklearn.ensemble import StackingRegressor from matplotlib import pyplot # get a stacking ensemble of models def get_stacking(): # define the base models base_models = list() base_models.append(('ridge', ridge_model)) base_models.append(('lasso', lasso_model)) base_models.append(('rf', rf_model_en)) # define meta learner model learner = LinearRegression() # define the stacking ensemble model = StackingRegressor(estimators=base_models, final_estimator=learner, cv=10) return model # get a list of models to evaluate def get_models(): models = dict() 
models['ridge'] = ridge_model models['lasso'] = lasso_model models['rf_en'] = rf_model_en models['stacking'] = get_stacking() return models # evaluate a given model using cross-validation def evaluate_model(model, X, y): cv = RepeatedKFold(n_splits=10, n_repeats=5, random_state=19) scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1, error_score='raise') return scores # get the models to evaluate models = get_models() # evaluate the models and store results results, names = list(), list() for name, model in models.items(): scores = evaluate_model(model, X_train, y_train) results.append(scores) names.append(name) print(f'{name} {mean(scores):.3f} {std(scores):.3f}') # plot model performance for comparison pyplot.boxplot(results, labels=names, showmeans=True) pyplot.show()",No,2,4.0 "# define the base models base_models = list() base_models.append(('ridge', ridge_model)) base_models.append(('lasso', lasso_model)) base_models.append(('rf', rf_model_en)) # define meta learner model learner = LinearRegression() # define the stacking ensemble stack1 = StackingRegressor(estimators=base_models, final_estimator=learner, cv=10) # fit the model on all available data stack1.fit(X, y)",No,5,82.0 "pivot=pd.pivot_table(train_df,columns='Country_Region',index='Date',values='ConfirmedCases',aggfunc=np.sum) pivot_fatality=pd.pivot_table(train_df,columns='Country_Region',index='Date',values='Fatalities',aggfunc=np.sum) country_list=[] value_list=[] fatality_list=[] for country in list(pivot.columns): country_list.append(country) value_list.append(pivot[country].max()) fatality_list.append(pivot_fatality[country].max()) new_dict={'Country':country_list,'Confirmed':value_list,'Fatality':fatality_list} df=pd.DataFrame.from_dict(new_dict) df.set_index('Country',inplace=True) plt.figure(figsize=(12,8)) plt.subplot(2,1,1) df['Confirmed'].sort_values(ascending=False)[:10].plot(kind='bar',color='blue') plt.title('Top 10 Countries by Confirmed Cases') plt.subplot(2,1,2) df['Fatality'].sort_values(ascending=False)[:10].plot(kind='bar',color='red') plt.title('Top 10 Countries with Fatalities due to Covid-19') plt.tight_layout()",No,5,33.0 "top_confirmed=df.sort_values(by='Confirmed',ascending=False)[:10]",No,5,9.0 "times_series_cntr = train_df.groupby(['Date','Country_Region'])['ConfirmedCases'].sum()\\ .reset_index().set_index('Date') df_countries_tm = times_series_cntr[times_series_cntr['Country_Region'].isin(list_countries)] plt.figure(figsize=(16,12)) ax = sns.lineplot(x=df_countries_tm.index, y=""ConfirmedCases"", hue=""Country_Region"", data=df_countries_tm,palette='muted').set_title('Cumulative line') plt.legend(loc=2, prop={'size': 12}) plt.title('Cumulative trend plot for Confirmed Cases') plt.xticks(rotation=90);'",No,5,81.0 "from sklearn import linear_model import numpy from sklearn.ensemble import RandomForestRegressor cls = RandomForestRegressor(n_estimators=100) cls.fit(X_train, Y_train) pred = cls.predict(X_test) pred = numpy.exp(pred) cls.score(X_train, Y_train)",No,2,7.0 "output = pd.DataFrame({'Id': test['Id'], 'Prediction': pred}) output.to_csv('submission.csv', index=False)",No,5,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load in import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv) # Input data files are available in the ""../input/"" directory. # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory import os #for dirname, _, filenames in os.walk('/kaggle/input'): #for filename in filenames: #print(os.path.join(dirname, filename)) # Any results you write to the current directory are saved as output.",No,5,22.0 "#dependencies import pandas as pd import keras import tensorflow as tf import matplotlib.pyplot as plt from matplotlib import style from sklearn.model_selection import train_test_split from sklearn.preprocessing import MinMaxScaler from sklearn import preprocessing, svm from keras.models import Sequential from keras.layers import Dense, Dropout, Activation, Flatten from keras.layers import Embedding import math from keras import metrics import seaborn as sns from sklearn.ensemble import RandomForestRegressor from sklearn.metrics import accuracy_score from sklearn.metrics import confusion_matrix from keras.models import Sequential from keras.layers import Dense, LSTM import matplotlib.pyplot as plt plt.style.use('fivethirtyeight') import datetime as dt from sklearn.ensemble import RandomForestRegressor from sklearn.tree import DecisionTreeRegressor from sklearn.metrics import accuracy_score from sklearn.metrics import confusion_matrix from sklearn.preprocessing import StandardScaler from sklearn.linear_model import LogisticRegression, LinearRegression from sklearn.model_selection import TimeSeriesSplit from sklearn.svm import SVR from xgboost import XGBRegressor",No,5,23.0 "df_train = pd.read_csv('../input/covid19-global-forecasting-week-2/train.csv', index_col=0)",No,5,45.0 "#df_train['Fatalities'].plt.show() df_train.drop(columns=['Province_State'], inplace=True)",No,4,10.0 "df_train.fillna(0, inplace=True) #df_train.set_index('Date', inplace=True)",No,5,17.0 "testData = pd.read_csv(""../input/test.csv"") submission = pd.DataFrame({ ""Id"": testData[""Id""], ""Prediction"": pred }) submission.to_csv('RandomForestSimple.csv',header=True, index=False)'",No,4,45.0 "le = preprocessing.LabelEncoder() df_train['Country_Region'] = le.fit_transform(df_train['Country_Region']) df_train['Date'] = le.fit_transform(df_train['Date']) df_train",No,5,20.0 "X = df_train.drop(columns=['Fatalities','ConfirmedCases']) scaler = MinMaxScaler(feature_range=(0,1)) scaled_data = scaler.fit_transform(X) #t_scaled_data = preprocessing.scale(X) X= np.array(X) X = preprocessing.scale(X)",No,3,10.0 "scaler.scale_ scale=1/ 1.51515152e-02",No,5,53.0 "# Import necessary libraries import datetime import numpy as np import pandas as pd import matplotlib as plt import warnings warnings.filterwarnings('ignore') # Import ML libraries from sklearn import preprocessing from sklearn.preprocessing import LabelEncoder from sklearn.preprocessing import OneHotEncoder from sklearn.model_selection import train_test_split from sklearn.tree import DecisionTreeClassifier from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor from xgboost import XGBClassifier, XGBRegressor from sklearn.model_selection import RandomizedSearchCV from sklearn.model_selection import GridSearchCV from sklearn.metrics import mean_absolute_error from sklearn.metrics import accuracy_score",No,5,23.0 "y = df_train.drop(columns=['Date','Country_Region','Fatalities']) scaler = MinMaxScaler(feature_range=(0,1)) scaled_data = scaler.fit_transform(y) y = np.array(y) y = preprocessing.scale(y)",No,2,10.0 "# Load and read files submission_example = 
pd.read_csv(""../input/covid19-global-forecasting-week-2/submission.csv"") train_df = pd.read_csv('../input/covid19-global-forecasting-week-2/train.csv') test_df = pd.read_csv('../input/covid19-global-forecasting-week-2/test.csv') # Rename columns train_df.rename(columns={'Country_Region': 'Country'}, inplace=True) train_df.rename(columns={'Province_State': 'State'}, inplace=True) test_df.rename(columns={'Country_Region': 'Country'}, inplace=True) test_df.rename(columns={'Province_State': 'State'}, inplace=True) display(train_df.head(5)) display(test_df.head(5)) train_df.info() print('\n') test_df.info()'",No,3,45.0 "# Transform the normal date to pandas datetime train_df['Date'] = pd.to_datetime(train_df['Date']) test_df['Date'] = pd.to_datetime(test_df['Date']) display(train_df.head(5)) display(test_df.head(5))",No,4,16.0 19698-15523,No,5,53.0 X.shape,No,5,58.0 "X_train,X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)",No,5,13.0 X_train.shape[1],No,5,58.0 "#X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1)) #X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1)) #X_train.shape",No,1,53.0 model = XGBRegressor(n_estimators=1000),No,5,4.0 "model.fit(X_train, y_train) #,batch_size = 50, epochs= 20)",No,5,7.0 "# Use the model's predict method on the test data prediction_s = model.predict(X_test) # Calculate the absolute errors errors_s = abs(prediction_s - y_test) # Print out the mean absolute error (mae) print('Mean Absolute Error:', round(np.mean(errors_s), 2))",No,4,27.0 "accuracy = model.score(X_test, y_test) # R^2 score of the regressor on the test split",No,5,49.0 "# Shape of training data print(train_df.shape) # Number of missing values in each column of training data missing_train_count_col = (train_df.isnull().sum()) print(missing_train_count_col[missing_train_count_col>0]) # Shape of testing data print(test_df.shape) # Number of missing values in each column of testing data missing_test_count_col = (test_df.isnull().sum()) print(missing_test_count_col[missing_test_count_col>0])",No,4,39.0 "# define the base models base_model = list() base_model.append(('rf1', rf_model)) base_model.append(('rf2', rf_model_en)) base_model.append(('rf3', RandomForestRegressor(max_depth=8, max_features=0.1, min_samples_leaf=3, min_samples_split=2, n_estimators=250, n_jobs=-1, oob_score=False))) # define meta learner model learner = LinearRegression() # define the stacking ensemble stack2 = StackingRegressor(estimators=base_model, final_estimator=learner, cv=10) # fit the model on all available data stack2.fit(X, y)",No,4,7.0 "df_t = pd.read_csv('../input/covid19-global-forecasting-week-2/test.csv',index_col=0)",No,5,45.0 "df_t.drop(columns=['Province_State'], inplace=True)",No,5,10.0 df_t,No,5,41.0 "df_t['Country_Region'] = le.fit_transform(df_t['Country_Region']) df_t['Date'] = le.fit_transform(df_t['Date'])",No,5,20.0 "submission = pd.DataFrame(columns=['Id','Prediction']) submission['Id'] = test_df['Id'] ridge_pred = ridge_model.predict(test_df.drop('Id', axis=1)) submission['Prediction'] = np.expm1(ridge_pred) submission.to_csv('submission_ridge.csv',index=False) lasso_pred = lasso_model.predict(test_df.drop('Id', axis=1)) submission['Prediction'] = np.expm1(lasso_pred) submission.to_csv('submission_lasso.csv',index=False) elastic_pred = el_model.predict(test_df.drop('Id', axis=1)) submission['Prediction'] = np.expm1(elastic_pred) submission.to_csv('submission_elastic.csv',index=False) knn_pred = knn_model.predict(test_df.drop('Id', axis=1)) 
submission['Prediction'] = np.expm1(knn_pred) submission.to_csv('submission_knn.csv',index=False) rf_pred = rf_model.predict(test_df.drop('Id', axis=1)) submission['Prediction'] = np.expm1(rf_pred) submission.to_csv('submission_rf.csv',index=False) lgb_pred = lgb_model.predict(test_df.drop('Id', axis=1)) submission['Prediction'] = np.expm1(lgb_pred) submission.to_csv('submission_lgb.csv',index=False) xgb_pred = xgb_model.predict(test_df.drop('Id', axis=1)) submission['Prediction'] = np.expm1(xgb_pred) submission.to_csv('submission_xgb.csv',index=False) stack_pred1 = stack1.predict(test_df.drop('Id', axis=1)) submission['Prediction'] = np.expm1(stack_pred1) submission.to_csv('submission_stack1.csv',index=False) stack_pred2 = stack2.predict(test_df.drop('Id', axis=1)) submission['Prediction'] = np.expm1(stack_pred2) submission.to_csv('submission_stack2.csv',index=False)",Yes,4,25.0 "#df_t = np.array(scaled_data) #t_scaled_data = scaler.fit_transform(df_t) #t_scaled_data = preprocessing.scale(df_t)",No,1,53.0 "#Libraries to import import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns import datetime as dt import pycountry import plotly_express as px sns.set_style('darkgrid') %matplotlib inline import warnings warnings.filterwarnings('ignore') from sklearn.preprocessing import OrdinalEncoder from sklearn import metrics import xgboost as xgb from xgboost import XGBRegressor from xgboost import plot_importance, plot_tree",No,4,22.0 "df_train = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-2/train.csv') df_test = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-2/test.csv')",No,5,45.0 "test_predictions = model.predict(subt_data) test_predictions.shape",No,4,27.0 "#test_predictions = test_predictions.reshape(12642,) #test_predictions = test_predictions.reshape(-1, 3)",No,1,53.0 #test_predictions = scaler.inverse_transform(test_predictions),No,1,53.0 "display(df_train.head()) display(df_train.describe()) display(df_train.info())",No,3,40.0 "#INVERSE TRANSFORM #test_predictions = test_predictions.reshape(12642,) #test_predictions_c = test_predictions* scale",No,1,53.0 "df_train['Date'] = pd.to_datetime(df_train['Date'], format = '%Y-%m-%d') df_test['Date'] = pd.to_datetime(df_test['Date'], format = '%Y-%m-%d')",No,5,16.0 "#test_predictions = test_predictions.reshape(12642,) test_predictions = test_predictions_c",No,4,12.0 "print('Minimum date from training set: {}'.format(df_train['Date'].min())) print('Maximum date from training set: {}'.format(df_train['Date'].max()))",No,5,40.0 "b""# Fill null values\ntrain_df['State'].fillna('No State', inplace=True)\ntest_df['State'].fillna('No State', inplace=True)\n\n# Number of missing values in each column of training data\nmissing_train_count_col = (train_df.isnull().sum())\nprint(missing_train_count_col[missing_train_count_col>0])\n\n# Number of missing values in each column of training data\nmissing_test_count_col = (test_df.isnull().sum())\nprint(missing_test_count_col[missing_test_count_col>0])\nprint('\\n')\n\n# Double check no remaining missing values\ntrain_df.info()\nprint('\\n')\ntest_df.info()""",No,3,17.0 "b""# Apply Label Encoding to train and test data\ntrain_df_encoded = train_df.copy()\ntest_df_encoded = test_df.copy()\n\n# Initialize Label encoder\nle = LabelEncoder()\n\n# Create date time features\ndef create_time_features(df):\n df['date'] = df['Date']\n df['hour'] = df['date'].dt.hour\n df['dayofweek'] = df['date'].dt.dayofweek\n df['quarter'] = df['date'].dt.quarter\n 
df['month'] = df['date'].dt.month\n df['year'] = df['date'].dt.year\n df['dayofyear'] = df['date'].dt.dayofyear\n df['dayofmonth'] = df['date'].dt.day\n df['weekofyear'] = df['date'].dt.weekofyear\n \n return df\n\ntrain_df_encoded = create_time_features(train_df_encoded)\ntest_df_encoded = create_time_features(test_df_encoded)\ntrain_df_encoded.State = le.fit_transform(train_df_encoded.State)\ntrain_df_encoded.Country = le.fit_transform(train_df_encoded.Country)\ntest_df_encoded.State = le.fit_transform(test_df_encoded.State)\ntest_df_encoded.Country = le.fit_transform(test_df_encoded.Country)\n\ndisplay(train_df_encoded.tail())\nprint('\\n')\ndisplay(test_df_encoded.tail())""",No,4,8.0 "df_sub = pd.read_csv('../input/covid19-global-forecasting-week-2/submission.csv') df_sub.drop(columns=['Fatalities','ConfirmedCases'], inplace=True) save_file_c = pd.DataFrame(test_predictions_c, columns=[['ConfirmedCases']]) result_c = pd.merge(df_sub, save_file_c,left_index=True, right_index=True) result_c.columns = ['ForecastId','ConfirmedCases']",No,4,12.0 "print('Minimum date from test set: {}'.format(df_test['Date'].min())) print('Maximum date from test set: {}'.format(df_test['Date'].max()))",No,5,40.0 "result_c = pd.merge(df_t,result_c ,on='ForecastId') df_t.drop(columns=['Country_Region'], inplace=True) #result_c.drop(columns=['Country_Region_y','Country_Region_x'], inplace=True",No,4,10.0 "# Specify all features for prediction x_features_drop = ['ConfirmedCases', 'Fatalities', 'Date', 'date'] y_target1 = ['ConfirmedCases'] y_target2 = ['Fatalities'] # Assign features into X, y1, y2 for training and testing X = train_df_encoded.drop(x_features_drop, axis=1) y1 = train_df_encoded[y_target1] y2 = train_df_encoded[y_target2] display(X.head()) display(y1.tail()) display(y2.tail())",No,4,10.0 "df_map = df_train.copy() df_map['Date'] = df_map['Date'].astype(str) df_map = df_map.groupby(['Date','Country_Region'], as_index=False)['ConfirmedCases','Fatalities'].sum()",No,3,60.0 "def get_iso3_util(country_name): try: country = pycountry.countries.get(name=country_name) return country.alpha_3 except: if 'Congo' in country_name: country_name = 'Congo' elif country_name == 'Diamond Princess' or country_name == 'Laos': return country_name elif country_name == 'Korea, South': country_name = 'Korea, Republic of' elif country_name == 'Taiwan*': country_name = 'Taiwan' country = pycountry.countries.search_fuzzy(country_name) return country[0].alpha_3 d = {} def get_iso3(country): if country in d: return d[country] else: d[country] = get_iso3_util(country) df_map['iso_alpha'] = df_map.apply(lambda x: get_iso3(x['Country_Region']), axis=1)",No,5,8.0 "df_map['ln(ConfirmedCases)'] = np.log(df_map.ConfirmedCases + 1) df_map['ln(Fatalities)'] = np.log(df_map.Fatalities + 1)",No,5,8.0 "b""# # Split into validaion and training data on 2 features\nrft1_train_X, rft1_val_X, rft1_train_y, rft1_val_y = train_test_split(X, y1, train_size=0.8, test_size=0.2, random_state=1)\nrft2_train_X, rft2_val_X, rft2_train_y, rft2_val_y = train_test_split(X, y2, train_size=0.8, test_size=0.2, random_state=2)\n\n# Define the models\nmodel_1 = DecisionTreeClassifier(splitter='best', max_features='log2', random_state=42)\nmodel_2 = DecisionTreeClassifier(splitter='random', max_features='log2', random_state=42)\nmodel_3 = DecisionTreeClassifier(splitter='best', max_features='sqrt', random_state=42)\nmodel_4 = DecisionTreeClassifier(splitter='random', max_features='sqrt', random_state=42)\nmodel_5 = DecisionTreeClassifier(splitter='random', 
max_features='log2', random_state=42)\nmodel_6 = DecisionTreeClassifier(splitter='random', max_features='sqrt', random_state=42)\nmodel_7 = DecisionTreeClassifier(splitter='best', max_features='log2', random_state=42)\nmodel_8 = DecisionTreeClassifier(splitter='best', max_features='sqrt', random_state=42)\n\nrf_models = [model_1, model_2, model_3, model_4, model_5, model_6, model_7, model_8]\n\n# Function for comparing different models\ndef score_model(model, train_X, val_X, train_y, val_y):\n model.fit(train_X, train_y)\n preds = model.predict(val_X)\n #accuracy = accuracy_score(y_v, preds)\n return mean_absolute_error(val_y, preds)\n\n# Evaluate the models for y1:\nfor i in range(0, len(rf_models)):\n mae = score_model(rf_models[i], rft1_train_X, rft1_val_X, rft1_train_y, rft1_val_y)\n print('Model %d MAE y1: %d' % (i+1, mae))\n\nprint('\\n')\n \n# Evaluate the models for y2:\nfor i in range(0, len(rf_models)):\n mae = score_model(rf_models[i], rft2_train_X, rft2_val_X, rft2_train_y, rft2_val_y)\n print('Model %d MAE y2: %d' % (i+1, mae))""",No,5,3.0 "result_c.set_index('ForecastId', inplace=True)",No,5,84.0 "result_c['ConfirmedCases'] = [0 if result_c.loc[i, 'ConfirmedCases'] <= -0 else result_c.loc[i, 'ConfirmedCases'] for i in result_c.index]",No,5,8.0 "#Fatalities X X = df_train.drop(columns=['Fatalities', ]) scaler = MinMaxScaler(feature_range=(0,1)) scaled_data = scaler.fit_transform(X) #t_scaled_data = preprocessing.scale(X) X= np.array(X) X = preprocessing.scale(X)",No,3,10.0 "#Fatalities y y = df_train.drop(columns=['Date','Country_Region','ConfirmedCases']) scaler = MinMaxScaler(feature_range=(0,1)) scaled_data = scaler.fit_transform(y) y = np.array(y) y = preprocessing.scale(y)",No,4,10.0 "# Choose best Random Forest Model for y1 and y2 best_rf_model_y1 = model_2 best_rf_model_y2 = model_2 # Assign features to test data x_test_features_drop = ['Date', 'date'] X_test = test_df_encoded.drop(x_test_features_drop, axis=1) # Predict the best model for y1 and y2 y1_pred = best_rf_model_y1.predict(X_test) y2_pred = best_rf_model_y2.predict(X_test) print(y1_pred[100:150]) print(y2_pred[100:150])",No,4,48.0 "# Save predictions in format used for competition scoring output = pd.DataFrame({'ForecastId': test_df.ForecastId, 'ConfirmedCases': rnd_y1_pred, 'Fatalities': rnd_y2_pred}) output.to_csv('submission.csv', index=False) print(output.tail(10)) print('Submission file successfully saved..')",No,5,25.0 "#X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1)) #X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1)) X_train.shape",No,5,58.0 "px.choropleth(df_map, locations=""iso_alpha"", color=""ln(ConfirmedCases)"", hover_name=""Country_Region"", hover_data=[""ConfirmedCases""] , animation_frame=""Date"", color_continuous_scale=px.colors.sequential.dense, title='Daily Confirmed Cases growth(Logarithmic Scale)')'",No,5,33.0 model1 = XGBRegressor(n_estimators=1000),No,5,4.0 "px.choropleth(df_map, locations=""iso_alpha"", color=""ln(Fatalities)"", hover_name=""Country_Region"", hover_data=[""Fatalities""], animation_frame=""Date"", color_continuous_scale=px.colors.sequential.OrRd, title = 'Daily Deaths growth(Logarithmic Scale)')'",No,4,33.0 "#Compile the model, because this is a binary classification problem, accuracy can be used #model1.compile(optimizer='Adam', loss= 'mean_squared_error')",No,1,53.0 "training = pd.read_csv(""/kaggle/input/covid19-global-forecasting-week-2/train.csv"") testing = 
pd.read_csv(""/kaggle/input/covid19-global-forecasting-week-2/test.csv"") data_ = pd.read_csv(""/kaggle/input/covid19-demographic-predictors/covid19_by_country.csv"")",No,5,45.0 "model1.fit(X_train, y_train)#, batch_size = 50, epochs= 20)",No,5,7.0 "train_c = result_c train_c = scaler.fit_transform(train_c) train_c= np.array(train_c) train_c = preprocessing.scale(train_c) #train_c = np.reshape(train_c, (train_c.shape[0], train_c.shape[1], 1))",No,4,18.0 "pred_f = model1.predict(train_c) #pred_f = pred_f.reshape(-1,3)",No,5,27.0 "save_file_f = pd.DataFrame(pred_f, columns=['Fatalities']) save_file_f.index += 1 ",No,5,55.0 "#Get the top 10 countries last_date = df_train.Date.max() df_countries = df_train[df_train['Date']==last_date] df_countries = df_countries.groupby('Country_Region', as_index=False)['ConfirmedCases','Fatalities'].sum() df_countries = df_countries.nlargest(10,'ConfirmedCases') #Get the trend for top 10 countries df_trend = df_train.groupby(['Date','Country_Region'], as_index=False)['ConfirmedCases','Fatalities'].sum() df_trend = df_trend.merge(df_countries, on='Country_Region') df_trend.drop(['ConfirmedCases_y','Fatalities_y'],axis=1, inplace=True) df_trend.rename(columns={'Country_Region':'Country', 'ConfirmedCases_x':'Cases', 'Fatalities_x':'Deaths'}, inplace=True) #Add columns for studying logarithmic trends df_trend['ln(Cases)'] = np.log(df_trend['Cases']+1)# Added 1 to remove error due to log(0). df_trend['ln(Deaths)'] = np.log(df_trend['Deaths']+1)",No,4,14.0 "#change the name of the 'country' feature to match 'Country_Region' on the train set data_['Country_Region']= data_.Country data_.drop('Country',axis=1, inplace =True)",No,4,10.0 "px.line(df_trend, x='Date', y='Cases', color='Country', title='COVID19 Cases growth for top 10 worst affected countries')",No,5,75.0 "result = pd.merge(result_c, save_file_f, left_index=True, right_index=True)",No,5,32.0 training.info(),No,5,40.0 "px.line(df_trend, x='Date', y='Deaths', color='Country', title='COVID19 Deaths growth for top 10 worst affected countries')",No,5,75.0 "print(data_.shape) print(training.shape)",No,5,58.0 result,No,5,41.0 "px.line(df_trend, x='Date', y='ln(Cases)', color='Country', title='COVID19 Cases growth for top 10 worst affected countries(Logarithmic Scale)')",No,5,75.0 "#missing values training.isnull().sum()",No,5,39.0 "px.line(df_trend, x='Date', y='ln(Deaths)', color='Country', title='COVID19 Deaths growth for top 10 worst affected countries(Logarithmic Scale)')",No,5,75.0 submission = result,No,5,77.0 "#missing values data_.isnull().sum()",No,5,39.0 "data_['Quarantine_date'] = pd.to_datetime(data_.Quarantine) data_['Restrictions_date'] = pd.to_datetime(data_.Restrictions) data_['Schools_date'] = pd.to_datetime(data_.Schools) data_.drop(['Schools', 'Restrictions', 'Quarantine'], axis =1, inplace = True)",No,4,16.0 "submission['Fatalities'] = [0 if submission.loc[i, 'Fatalities'] < 0 else submission.loc[i, 'Fatalities'] for i in submission.index]",No,5,8.0 training.Date = pd.to_datetime(training.Date),No,5,16.0 submission,No,5,41.0 "training = training.fillna({'Province_State': 'Unknown'}) testing = testing.fillna({'Province_State': 'Unknown'})",No,5,17.0 "df_us = df_train[df_train['Country_Region']=='US'] df_us['Date'] = df_us['Date'].astype(str) df_us['state_code'] = df_us.apply(lambda x: us_state_abbrev.get(x.Province_State,float('nan')), axis=1) df_us['ln(ConfirmedCases)'] = np.log(df_us.ConfirmedCases + 1) df_us['ln(Fatalities)'] = np.log(df_us.Fatalities + 1)",No,4,8.0 " 
submission.drop(columns=['Country_Region','Date'], inplace=True)",No,5,10.0 data_.info(),No,5,40.0 "px.choropleth(df_us, locationmode=""USA-states"", scope=""usa"", locations=""state_code"", color=""ln(ConfirmedCases)"", hover_name=""Province_State"", hover_data=[""ConfirmedCases""], animation_frame=""Date"", color_continuous_scale=px.colors.sequential.Darkmint, title = 'Daily Cases growth for USA(Logarithmic Scale)')'",No,5,33.0 "px.choropleth(df_us, locationmode=""USA-states"", scope=""usa"", locations=""state_code"", color=""ln(Fatalities)"", hover_name=""Province_State"", hover_data=[""Fatalities""], animation_frame=""Date"", color_continuous_scale=px.colors.sequential.OrRd, title = 'Daily deaths growth for USA(Logarithmic Scale)')'",No,5,33.0 submission['Fatalities'].sum(),No,5,40.0 len(submission),No,5,58.0 submission.to_csv('submission.csv'),No,5,25.0 "df_train.Province_State.fillna('NaN', inplace=True)",No,5,17.0 "df_plot = df_train.groupby(['Date','Country_Region','Province_State'], as_index=False)['ConfirmedCases','Fatalities'].sum()",No,5,60.0 "import pandas as pd import matplotlib.pyplot as plt %matplotlib inline import datetime import numpy as np import random import torch import torch.nn as nn import torch.nn.functional as F import torch.utils.data as tud import torch.optim as optim # set cuda USE_CUDA = torch.cuda.is_available() DEVICE = torch.device('cuda' if USE_CUDA else 'cpu') #set random seed RANDOM_SEED = 10015 random.seed(RANDOM_SEED) np.random.seed(RANDOM_SEED) torch.manual_seed(RANDOM_SEED) if USE_CUDA: torch.cuda.manual_seed(RANDOM_SEED) # set hyper parameters # LSTM NUM_HIDDEN = 256 N_STEP = 66 # data TIME_STEP = 67 NUM_FEATURES = 2 # train BATCH_SIZE = TIME_STEP // N_STEP EPOCHS = 2000 LEARNING_RATE = 0.01 CLIP_VALUE = 1. 
# predict N_PREFIX = 10 N_PREDICT = 33",No,5,77.0 "df = training.groupby(['Country_Region', 'Date'], as_index=False).sum() df_test = testing.groupby(['Country_Region', 'Date'], as_index=False).sum()",No,5,60.0 df_test[df_test.Country_Region == 'Italy'],No,5,14.0 df[df.Country_Region == 'Italy'],No,5,14.0 len(df.Country_Region.unique()),No,5,54.0 len(df_test.Country_Region.unique()),No,5,54.0 "train = pd.merge(training, data_, on=['Country_Region'], how= 'left') test = pd.merge(testing, data_, on=['Country_Region'], how= 'left')",No,5,32.0 "df = df_plot.query(""Country_Region=='India'"") px.line(df, x='Date', y='ConfirmedCases', title='Daily Cases growth for India')'",No,5,33.0 train.isna().sum(),No,5,39.0 "px.line(df, x='Date', y='Fatalities', title='Daily Deaths growth for India')",No,5,75.0 "data_[data_.Restrictions_date.notnull()][['Country_Region', 'Quarantine_date']]",No,5,14.0 train.loc[(train['Date'] == '2020-03-20') &(train.Country_Region == 'Argentina') ],No,5,14.0 "import pandas as pd import numpy as np",No,5,22.0 "ch_geojson = ""../input/china-regions-map/china-provinces.json"" df_plot['day'] = df_plot.Date.dt.dayofyear df_plot['Province_ch'] = """"'",No,3,8.0 "PATH_WEEK='/kaggle/input/covid19-global-forecasting-week-2' df_train = pd.read_csv(f'{PATH_WEEK}/train.csv') df_test = pd.read_csv(f'{PATH_WEEK}/test.csv') df_hospital_beds = pd.read_csv(r""/kaggle/input/hospital-beds/API_SH.MED.BEDS.ZS_DS2_en_csv_v2_887506.csv"", skiprows=4) df_population2 = pd.read_csv(""/kaggle/input/populationdata/population_by_country_2020.csv"", na_values=""N.A."") df_environment_pm2 = pd.read_csv(""/kaggle/input/environmentpm25/API_EN.ATM.PM25.MC.M3_DS2_en_csv_v2_888986.csv"", skiprows=4) df_train.rename(columns={'Country_Region':'Country'}, inplace=True) df_test.rename(columns={'Country_Region':'Country'}, inplace=True) df_train.rename(columns={'Province_State':'State'}, inplace=True) df_test.rename(columns={'Province_State':'State'}, inplace=True) df_hospital_beds.rename(columns={'Country Name' : 'Country'}, inplace=True) df_environment_pm2.rename(columns={'Country Name' : 'Country'}, inplace=True) df_population2.set_axis([""Country"", ""Population"", ""YearlyChange"", ""NetChange"", ""Density"", ""LandArea"", ""Migrants"", ""FertilityRate"", ""MedAge"", ""UrbanPop"", ""WorldShare""], axis=1, inplace=True) df_train['Date'] = pd.to_datetime(df_train['Date'], infer_datetime_format=True) df_test['Date'] = pd.to_datetime(df_test['Date'], infer_datetime_format=True) df_train.info()'",No,1,45.0 "test['Quarantine'] = 0 test['Schools'] = 0 test['Restrictions'] = 0 test.loc[(test.Country_Region == 'Argentina'), 'Quarantine' ] = 1 test.loc[(test.Country_Region == 'Austria'), 'Quarantine' ] = 1 test.loc[(test.Country_Region == 'Belgium'), 'Quarantine' ] = 1 test.loc[(test.Country_Region == 'China'), 'Quarantine' ] = 1 test.loc[(test.Country_Region == 'Colombia'), 'Quarantine' ] = 1 test.loc[(test.Country_Region == 'Denmark'), 'Quarantine' ] = 1 test.loc[(test.Country_Region == 'France'), 'Quarantine' ] = 1 test.loc[(test.Country_Region == 'Germany'), 'Quarantine' ] = 1 test.loc[(test.Country_Region == 'India'), 'Quarantine' ] = 1 test.loc[(test.Country_Region == 'Israel'), 'Quarantine' ] = 1 test.loc[(test.Country_Region == 'Italy'), 'Quarantine' ] = 1 test.loc[(test.Country_Region == 'Malaysia'), 'Quarantine' ] = 1 test.loc[(test.Country_Region == 'New Zealand'), 'Quarantine' ] = 1 test.loc[(test.Country_Region == 'Peru'), 'Quarantine' ] = 1 test.loc[(test.Country_Region == 'Spain'), 'Quarantine' ] = 1 
test.loc[(test.Country_Region == 'Israel'), 'Schools' ] = 1 test.loc[(test.Country_Region == 'Israel'), 'Restrictions' ] = 1 test.drop(['Quarantine_date', 'Schools_date', 'Restrictions_date'], axis = 1, inplace = True)",No,4,8.0 "df = df_plot.query(""Country_Region=='China'"") fig = px.choropleth_mapbox(df, geojson=ch_geojson, #scope=""asia"", color=""ConfirmedCases"", locations=""Province_ch"", featureidkey=""objects.CHN_adm1.geometries.properties.NL_NAME_1"", #featureidkey=""features.properties.name"", animation_frame=""day"") fig.update_geos(fitbounds=""locations"", visible=False) fig.update_layout(margin={""r"":0,""t"":0,""l"":0,""b"":0}) fig.show()'",No,5,33.0 "train['Quarantine'] = 0 train['Schools'] = 0 train['Restrictions'] = 0 train.loc[(train['Date'] >= '2020-03-20') &(train.Country_Region == 'Argentina'), 'Quarantine' ] = 1 train.loc[(train['Date'] >= '2020-03-16') &(train.Country_Region == 'Austria'), 'Quarantine' ] = 1 train.loc[(train['Date'] >= '2020-03-18') &(train.Country_Region == 'Belgium'), 'Quarantine' ] = 1 train.loc[(train['Date'] >= '2020-01-24') &(train.Country_Region == 'China'), 'Quarantine' ] = 1 train.loc[(train['Date'] >= '2020-03-25') &(train.Country_Region == 'Colombia'), 'Quarantine' ] = 1 train.loc[(train['Date'] >= '2020-03-16') &(train.Country_Region == 'Denmark'), 'Quarantine' ] = 1 train.loc[(train['Date'] >= '2020-03-17') &(train.Country_Region == 'France'), 'Quarantine' ] = 1 train.loc[(train['Date'] >= '2020-03-21') &(train.Country_Region == 'Germany'), 'Quarantine' ] = 1 train.loc[(train['Date'] >= '2020-03-23') &(train.Country_Region == 'India'), 'Quarantine' ] = 1 train.loc[(train['Date'] >= '2020-03-19') &(train.Country_Region == 'Israel'), 'Quarantine' ] = 1 train.loc[(train['Date'] >= '2020-03-08') &(train.Country_Region == 'Italy'), 'Quarantine' ] = 1 train.loc[(train['Date'] >= '2020-03-18') &(train.Country_Region == 'Malaysia'), 'Quarantine' ] = 1 train.loc[(train['Date'] >= '2020-03-23') &(train.Country_Region == 'New Zealand'), 'Quarantine' ] = 1 train.loc[(train['Date'] >= '2020-03-15') &(train.Country_Region == 'Peru'), 'Quarantine' ] = 1 train.loc[(train['Date'] >= '2020-03-15') &(train.Country_Region == 'Spain'), 'Quarantine' ] = 1 train.loc[(train['Date'] >= '2020-03-19') &(train.Country_Region == 'Israel'), 'Schools' ] = 1 train.loc[(train['Date'] >= '2020-03-19') &(train.Country_Region == 'Israel'), 'Restrictions' ] = 1 train.drop(['Quarantine_date', 'Schools_date', 'Restrictions_date'], axis = 1, inplace = True)",No,5,8.0 "train[train.Quarantine == 1][['Country_Region', 'Date']].head(50)",No,5,41.0 "df = df_plot.query(""Country_Region=='China'"") px.line(df, x='Date', y='ConfirmedCases', color='Province_State', title='Daily Cases growth for China')'",No,5,33.0 "country_names = {'Bahamas, The': 'Bahamas', 'Brunei Darussalam' : 'Brunei', 'DR Congo' : 'Congo (Kinshasa)', ""Cte d'Ivoire"" : ""Cote d'Ivoire"", 'Congo' : 'Congo (Brazzaville)', 'Congo, Rep.': 'Congo (Brazzaville)', 'Congo, Dem. Rep.': 'Congo (Kinshasa)', 'Czech Republic (Czechia)' : 'Czechia', 'Czech Republic': 'Czechia', 'Diamond Princess': 'Diamond Princess', 'Egypt, Arab Rep.': 'Egypt', 'Gambia, The': 'Gambia', 'Holy See': 'Holy See', 'Iran, Islamic Rep.': 'Iran', 'Korea, Rep.': 'Korea, South', 'South Korea':'Korea, South', 'Kyrgyz Republic': 'Kyrgyzstan', 'Lao PDR': 'Laos', 'Russian Federation': 'Russia', 'St. Kitts and Nevis': 'Saint Kitts and Nevis', 'Saint Kitts & Nevis' : 'Saint Kitts and Nevis', 'St. Lucia': 'Saint Lucia', 'St. 
Vincent and the Grenadines': 'Saint Vincent and the Grenadines', 'St. Vincent & Grenadines':'Saint Vincent and the Grenadines', 'Serbia': 'Serbia', 'Slovak Republic': 'Slovakia', 'Syrian Arab Republic': 'Syria', 'Taiwan': 'Taiwan*', 'United States': 'US', 'Venezuela, RB': 'Venezuela' } df_population2.Country.replace(country_names, inplace=True) df_hospital_beds.Country.replace(country_names, inplace=True) df_environment_pm2.Country.replace(country_names, inplace=True) df_train = pd.merge(df_train, df_population2, on=""Country"", how=""left"") df_test = pd.merge(df_test, df_population2, on=""Country"", how=""left"") df_hospital_beds.rename(columns={'2011':'HospitalBeds'}, inplace=True) df_train = pd.merge(df_train, df_hospital_beds[[""Country"", ""HospitalBeds""]], on=""Country"", how=""left"") df_test = pd.merge(df_test, df_hospital_beds[[""Country"", ""HospitalBeds""]], on=""Country"", how=""left"") df_environment_pm2.rename(columns={'2017':'PM25'}, inplace=True) df_train = pd.merge(df_train, df_environment_pm2[[""Country"", ""PM25""]], on=""Country"", how=""left"") df_test = pd.merge(df_test, df_environment_pm2[[""Country"", ""PM25""]], on=""Country"", how=""left"") '",Yes,1,8.0 "px.line(df, x='Date', y='Fatalities', color='Province_State', title='Daily Deaths growth for China')",No,5,75.0 data_[data_.Quarantine_date.notnull()],No,5,14.0 "df_train['NumDate'] = df_train.Date.astype(int)/((10**9)*60*60*24) first_date = df_train.NumDate.min() df_train.NumDate -= first_date df_train.head() df_test['NumDate'] = df_test.Date.astype(int)/((10**9)*60*60*24) df_test.NumDate -= first_date outbreak_dates = df_train[['Country', 'NumDate']][df_train.ConfirmedCases>0].groupby('Country', as_index=False).min() outbreak_dates.columns = ['Country', 'FirstOutbreak'] first_death = df_train[['Country', 'NumDate']][df_train.Fatalities>0].groupby('Country', as_index=False).min() first_death.columns = ['Country', 'FirstDeath'] df_train = pd.merge(df_train, outbreak_dates, how='left') df_test = pd.merge(df_test, outbreak_dates, how='left') df_train = pd.merge(df_train, first_death, how='left') df_train.FirstDeath.fillna(0,inplace=True) df_test = pd.merge(df_test, first_death, how='left') df_test.FirstDeath.fillna(0,inplace=True) df_train['DaysSinceOutbreak'] = df_train.NumDate - df_train.FirstOutbreak df_test['DaysSinceOutbreak'] = df_test.NumDate - df_test.FirstOutbreak df_train['DaysSinceFirstDeath'] = df_train.NumDate - df_train.FirstDeath df_test['DaysSinceFirstDeath'] = df_test.NumDate - df_test.FirstDeath df_train.head()",Yes,1,8.0 "def categoricalToInteger(df): #convert NaN Province State values to a string df.Province_State.fillna('NaN', inplace=True) #Define Ordinal Encoder Model oe = OrdinalEncoder() df[['Province_State','Country_Region']] = oe.fit_transform(df.iloc[:,1:3]) return df",No,5,20.0 "df_train.WorldShare = df_train.WorldShare.str.rstrip('%').astype('float') / 100.0 df_test.WorldShare = df_test.WorldShare.str.rstrip('%').astype('float') / 100.0 df_train.UrbanPop = df_train.UrbanPop.str.rstrip('%').astype('float') / 100.0 df_test.UrbanPop = df_test.UrbanPop.str.rstrip('%').astype('float') / 100.0 df_train.YearlyChange = df_train.YearlyChange.str.rstrip('%').astype('float') / 100.0 df_test.YearlyChange = df_test.YearlyChange.str.rstrip('%').astype('float') / 100.0",No,5,16.0 "def create_features(df): df['day'] = df['Date'].dt.day df['month'] = df['Date'].dt.month df['dayofweek'] = df['Date'].dt.dayofweek df['dayofyear'] = df['Date'].dt.dayofyear df['quarter'] = df['Date'].dt.quarter 
df['weekofyear'] = df['Date'].dt.weekofyear return df",No,5,8.0 "from scipy.optimize import curve_fit import matplotlib.pyplot as plt def log_growth(x, a,r,c): return 1.0*c/(1+a * np.exp(-r * x)) def get_log_fit_params(data): try: popt, pcov = curve_fit(log_growth, data.NumDate, data.ConfirmedCases, method=""lm"") return popt except RuntimeError: try: popt, pcov = curve_fit(log_growth, data.NumDate, data.ConfirmedCases, method=""trf"") except RuntimeError: return np.zeros(3) df_popt = df_train.groupby(""Country"").apply(get_log_fit_params) df_opt = df_popt.apply(pd.Series) df_opt.set_axis([""a"", ""r"", ""c""], axis=1, inplace=True) df_opt.head()",No,5,53.0 df_opt[df_opt.a>0],No,5,14.0 "df_train = pd.merge(df_train, df_opt, on=""Country"", how=""left"") df_test = pd.merge(df_test, df_opt, on=""Country"", how=""left"")",No,5,32.0 "df_train['LogPrediction'] = df_train.apply(lambda x : log_growth(x.NumDate, x.a, x.r, x.c), axis=1) df_test['LogPrediction'] = df_test.apply(lambda x : log_growth(x.NumDate, x.a, x.r, x.c), axis=1)",No,5,8.0 train['Quarantine'].any() ==1,No,5,53.0 train[train.Country_Region=='Italy'],No,5,14.0 "def cum_sum(df, date, country, state): sub_df = df[(df['Country_Region']==country) & (df['Province_State']==state) & (df['Date']<=date)] display(sub_df) return sub_df['ConfirmedCases'].sum(), sub_df['Fatalities'].sum()",Yes,3,14.0 "from sklearn.metrics import mean_squared_log_error from sklearn.model_selection import KFold from sklearn.preprocessing import PolynomialFeatures import xgboost as xgb df_train.fillna(0, inplace=True) df_test.fillna(0, inplace=True) features = ['NumDate', ""Population"", ""YearlyChange"", ""NetChange"", ""Density"", ""LandArea"", ""Migrants"", ""FertilityRate"", ""MedAge"", ""UrbanPop"", ""WorldShare"", 'FirstOutbreak', 'DaysSinceOutbreak', 'FirstDeath', 'DaysSinceFirstDeath', 'HospitalBeds', 'PM25', 'LogPrediction', 'a', 'r', 'c'] X = pd.concat([df_train[features], pd.get_dummies(df_train.Country,prefix=""C_""), pd.get_dummies(df_train.State,prefix=""S_"")],axis=1) y1 = df_train.ConfirmedCases y2 = df_train.Fatalities fit1 = xgb.XGBRegressor(n_estimators=5000, random_state = 123).fit(X, y1) fit2 = xgb.XGBRegressor(n_estimators=5000, random_state = 123).fit(X, y2) error1 = np.sqrt(mean_squared_log_error([max(x,0) for x in fit1.predict(X)], y1)) error2 = np.sqrt(mean_squared_log_error([max(x,0) for x in fit2.predict(X)], y2)) print(error1) print(error2) print((error1+error2)/2)'",Yes,1,11.0 "df_out = pd.DataFrame(df_test.ForecastId) X = pd.concat([df_test[features], pd.get_dummies(df_test.Country,prefix=""C_""), pd.get_dummies(df_test.State,prefix=""S_"")],axis=1) df_out['ConfirmedCases'] = [max(x,0) for x in fit1.predict(X)] df_out['Fatalities'] = [max(x,0) for x in fit2.predict(X)] df_out.tail()'",Yes,1,11.0 train[train['Restrictions'] == 1],No,5,14.0 train.columns,No,5,71.0 "def train_dev_split(df): date = df['Date'].max() - dt.timedelta(days=7) return df[df['Date'] <= date], df[df['Date'] > date]",No,5,13.0 "train.hist(figsize=(11,10))",No,5,33.0 "feat_importances = pd.Series(fit1.feature_importances_, index=X.columns) feat_importances.sort_values(ascending=False).head(20)",No,4,9.0 "df_train = categoricalToInteger(df_train) df_train = create_features(df_train)",No,5,8.0 "train.drop(['Tests','Test Pop', 'Density', 'Urban Pop', 'sex0', 'sex14', 'sex25', 'sex54', 'sex64', 'sex65plus', 'Sex Ratio', 'lung', 'Female Lung', 'Male Lung', 'Crime Index', 'Population 2020', 'Smoking 2016', 'Females 2018', 'Total Infected','Total Deaths', 'Total 
Recovered', 'Hospital Bed', 'Median Age', 'GDP 2018'], axis = 1, inplace = True)",No,5,10.0 "prediction_ww = pd.merge(df_test, df_out, on=""ForecastId"")[[""Date"", ""ConfirmedCases"", ""Fatalities""]].groupby(""Date"").sum() prediction_ww.set_axis([""PredictedCases"", ""PredictedFatalities""], axis=1, inplace=True) prediction_ww.plot()",No,4,11.0 "df_train, df_dev = train_dev_split(df_train)",No,5,13.0 "test.drop(['Tests','Test Pop', 'Density', 'Urban Pop', 'sex0', 'sex14', 'sex25', 'sex54', 'sex64', 'sex65plus', 'Sex Ratio', 'lung', 'Female Lung', 'Male Lung', 'Crime Index', 'Population 2020', 'Smoking 2016', 'Females 2018', 'Total Infected', 'Total Deaths', 'Total Recovered', 'Hospital Bed', 'Median Age', 'GDP 2018'], axis = 1, inplace = True)",No,5,10.0 "train_ww = df_train[[""Date"", ""ConfirmedCases"", ""Fatalities""]].groupby(""Date"").sum() train_ww.plot()",No,4,60.0 "pd.merge(train_ww, prediction_ww, how='outer', on=""Date"").plot()",No,5,32.0 print(train.describe()),No,5,40.0 print(test.describe()),No,5,40.0 test.isna().sum(),No,5,39.0 "columns = ['day','month','dayofweek','dayofyear','quarter','weekofyear','Province_State', 'Country_Region','ConfirmedCases','Fatalities'] df_train = df_train[columns] df_dev = df_dev[columns]",No,5,10.0 test.Date = pd.to_datetime(test.Date),No,5,16.0 "train = df_train.values dev = df_dev.values X_train, y_train = train[:,:-2], train[:,-2:] X_dev, y_dev = dev[:,:-2], dev[:,-2:]",No,4,21.0 "def create_time_features(df): """""" Creates time series features from datetime index """""" df['date'] = df.index df['dayofweek'] = df['Date'].dt.dayofweek df['quarter'] = df['Date'].dt.quarter df['month'] = df['Date'].dt.month df['dayofyear'] = df['Date'].dt.dayofyear df['dayofmonth'] = df['Date'].dt.day df['weekofyear'] = df['Date'].dt.weekofyear X = df[['dayofweek','quarter','month', 'dayofyear','dayofmonth','weekofyear']] return X",No,5,8.0 "import pandas as pd #import plotly.express as px #import matplotlib.pyplot as plt #import plotly.graph_objects as go from sklearn.model_selection import train_test_split from sklearn.linear_model import LinearRegression import numpy as np from sklearn.metrics import mean_squared_log_error from sklearn.tree import DecisionTreeRegressor from google.cloud import bigquery from scipy.spatial.distance import cdist from sklearn.preprocessing import LabelEncoder",No,5,22.0 "train_df=pd.read_csv(""../input/covid19-global-forecasting-week-2/train.csv"") test_df=pd.read_csv(""../input/covid19-global-forecasting-week-2/test.csv"")",No,5,45.0 train.Date = pd.to_datetime(train.Date),No,5,16.0 "create_time_features(train) create_time_features(test)",No,5,8.0 "def modelfit(alg, X_train, y_train,useTrainCV=True, cv_folds=5, early_stopping_rounds=50): if useTrainCV: xgb_param = alg.get_xgb_params() xgtrain = xgb.DMatrix(X_train, label=y_train) cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds, metrics='rmse', early_stopping_rounds=early_stopping_rounds, show_stdv=False) alg.set_params(n_estimators=cvresult.shape[0]) #Fit the algorithm on the data alg.fit(X_train, y_train,eval_metric='rmse') #Predict training set: predictions = alg.predict(X_train) #predprob = alg.predict_proba(X_train)[:,1] #Print model report: print(""\nModel Report"") #print(""Accuracy : %.4g"" % metrics.accuracy_score(y_train, predictions)) print(""RMSE Score (Train): %f"" % metrics.mean_squared_error(y_train, predictions)) feat_imp = pd.Series(alg.feature_importances_).sort_values(ascending=False) 
feat_imp.plot(kind='bar', title='Feature Importances') plt.ylabel('Feature Importance Score')",Yes,2,7.0 "train.drop([""Id"",""Date"", 'date'], axis=1, inplace=True) test.drop([""Date"", 'date'], axis=1, inplace=True)",No,5,10.0 "print(""Min train date: "",train_df[""Date""].min()) print(""Max train date: "",train_df[""Date""].max()) print(""Min test date: "",test_df[""Date""].min()) print(""Max test date: "",test_df[""Date""].max())",No,5,40.0 "train_df=train_df[train_df[""Date""]<""2020-03-19""]",No,5,14.0 test_df.isnull().sum(),No,5,39.0 "pop_info = pd.read_csv(""../input/population-by-country-2020/population_by_country_2020.csv"")",No,5,45.0 "pop_info.rename(columns={'Density (P/Km²)': 'Density'}, inplace=True)",No,5,61.0 pop_info.columns,No,5,71.0 "country_lookup=pop_info[[""Country (or dependency)"",""Population (2020)"",""Density""]]",No,5,12.0 "pd.DataFrame.from_dict(country_lookup) train_df_pop=pd.merge(train_df, country_lookup, how='left', left_on='Country_Region', right_on='Country (or dependency)')",No,4,11.0 train_df_pop.info(),No,5,40.0 "train_df_pop.loc[train_df_pop[""Country_Region""]==""US"", [""Population (2020)""]]=331002651 #United States train_df_pop.loc[train_df_pop[""Country_Region""]==""US"", [""Density""]]=36 train_df_pop.loc[train_df_pop[""Country_Region""]==""Korea, South"", [""Population (2020)""]]=51269185 #South Korea train_df_pop.loc[train_df_pop[""Country_Region""]==""Korea, South"", [""Density""]]=527 train_df_pop.loc[train_df_pop[""Country_Region""]==""Czechia"", [""Population (2020)""]]=10708981 #Czech Republic train_df_pop.loc[train_df_pop[""Country_Region""]==""Czechia"", [""Density""]]=139 train_df_pop.loc[train_df_pop[""Country_Region""]==""Taiwan*"", [""Population (2020)""]]=23816775 #Taiwan train_df_pop.loc[train_df_pop[""Country_Region""]==""Taiwan*"", [""Density""]]=673 train_df_pop.loc[train_df_pop[""Country_Region""]==""Congo (Kinshasa)"", [""Population (2020)""]]=89561403 #DR Congo train_df_pop.loc[train_df_pop[""Country_Region""]==""Congo (Kinshasa)"", [""Density""]]=40 train_df_pop.loc[train_df_pop[""Country_Region""]==""Congo (Brazzaville)"", [""Population (2020)""]]=5518087 #Congo train_df_pop.loc[train_df_pop[""Country_Region""]==""Congo (Brazzaville)"", [""Density""]]=16 train_df_pop.loc[train_df_pop[""Country_Region""]==""Cote d'Ivoire"", [""Population (2020)""]]=26378274 #Côte d'Ivoire train_df_pop.loc[train_df_pop[""Country_Region""]==""Cote d'Ivoire"", [""Density""]]=83 train_df_pop.loc[train_df_pop[""Country_Region""]==""Saint Kitts and Nevis"", [""Population (2020)""]]=53199 #Saint Kitts & Nevis train_df_pop.loc[train_df_pop[""Country_Region""]==""Saint Kitts and Nevis"", [""Density""]]=205 train_df_pop.loc[train_df_pop[""Country_Region""]==""Saint Vincent and the Grenadines"", [""Population (2020)""]]=110940 #St. 
Vincent & Grenadines train_df_pop.loc[train_df_pop[""Country_Region""]==""Saint Vincent and the Grenadines"", [""Density""]]=284 train_df_pop.loc[train_df_pop[""Country_Region""]==""Diamond Princess"", [""Population (2020)""]]=3770 #Population and density are same since it is a cruise ship train_df_pop.loc[train_df_pop[""Country_Region""]==""Diamond Princess"", [""Density""]]=3770'",No,5,8.0 "model1 = XGBRegressor(n_estimators=1000) model2 = XGBRegressor(n_estimators=1000)",No,5,4.0 "model1.fit(X_train, y_train[:,0], eval_set=[(X_train, y_train[:,0]), (X_dev, y_dev[:,0])], verbose=False)",No,5,7.0 "model2.fit(X_train, y_train[:,1], eval_set=[(X_train, y_train[:,1]), (X_dev, y_dev[:,1])], verbose=False)",No,5,7.0 plot_importance(model1);,No,3,79.0 plot_importance(model2);,No,3,79.0 "df_train = categoricalToInteger(df_test) df_train = create_features(df_test)",No,3,8.0 "columns = ['day','month','dayofweek','dayofyear','quarter','weekofyear','Province_State', 'Country_Region'] df_test = df_test[columns]",No,5,10.0 "y_pred1 = model1.predict(df_test.values) y_pred2 = model2.predict(df_test.values)",No,5,48.0 df_submit = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-2/submission.csv'),No,5,45.0 "df_submit.ConfirmedCases = y_pred1 df_submit.Fatalities = y_pred2",No,5,55.0 "df_submit.to_csv(r'submission.csv', index=False)",No,5,25.0 "# Installing the required libs !pip install -q fastprogress fastai2 fastcore fast_tabnet --upgrade ",No,5,87.0 "from fastai2.basics import * from fastai2.tabular.all import * from fast_tabnet.core import *",No,5,22.0 "import sys sys.path.insert(0, ""../input/covid19-global-forecasting-week-2/"") import warnings warnings.filterwarnings(action='once')'",No,3,23.0 df = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-2/train.csv') ,No,5,45.0 "df_test = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-2/test.csv') df_test.head()",Yes,4,45.0 "df['key'] = df['Country_Region'] + '#' + df['Province_State'].fillna('') df['province_flag'] = np.where(df['Province_State'].isnull(),0,1) df['Province_State'] = df['Province_State'].fillna(df['Country_Region'])",Yes,3,8.0 "df_test['key'] = df_test['Country_Region'] + '#' + df_test['Province_State'].fillna('') df_test['province_flag'] = np.where(df_test['Province_State'].isnull(),0,1) df_test['Province_State'] = df_test['Province_State'].fillna(df_test['Country_Region'])",Yes,5,8.0 df.head(600),No,5,41.0 df_test.head(600),No,5,41.0 "#firstconfirmed = df[(df['ConfirmedCases']>0) & (df['Date']<'2020-03-19')].groupby(['Province_State','Country_Region'])['Date'].min().reset_index() firstconfirmed = df[(df['ConfirmedCases']>0)].groupby(['Province_State','Country_Region'])['Date'].min().reset_index()",No,5,60.0 firstconfirmed.head(),No,5,41.0 firstconfirmed.shape,No,5,58.0 "firstfatality = df[(df['Fatalities']>0)].groupby(['Province_State','Country_Region'])['Date'].min().reset_index() #firstfatality = df[(df['Fatalities']>0) & (df['Date']<'2020-03-19')].groupby(['Province_State','Country_Region'])['Date'].min().reset_index()",No,4,60.0 firstfatality.head(),No,5,41.0 firstfatality.shape,No,5,58.0 "firstconfirmed.columns = ['Province_State','Country_Region','FirstCaseDate'] firstfatality.columns = ['Province_State','Country_Region','FirstFatalityDate']",No,5,61.0 "df = df.merge(firstconfirmed, left_on=['Province_State','Country_Region'],right_on=['Province_State','Country_Region'],how='left') df = df.merge(firstfatality, 
left_on=['Province_State','Country_Region'],right_on=['Province_State','Country_Region'],how='left')",No,5,32.0 df.head(40),No,5,41.0 "df_test = df_test.merge(firstconfirmed, left_on=['Province_State','Country_Region'],right_on=['Province_State','Country_Region'],how='left') df_test = df_test.merge(firstfatality, left_on=['Province_State','Country_Region'],right_on=['Province_State','Country_Region'],how='left')",No,5,32.0 df_test.head(40),No,5,41.0 "df['Date']=pd.to_datetime(df['Date'], infer_datetime_format=True) df['FirstCaseDate']=pd.to_datetime(df['FirstCaseDate'], infer_datetime_format=True) df['FirstFatalityDate']=pd.to_datetime(df['FirstFatalityDate'], infer_datetime_format=True) ",No,5,16.0 "df_test['Date']=pd.to_datetime(df_test['Date'], infer_datetime_format=True) df_test['FirstCaseDate']=pd.to_datetime(df_test['FirstCaseDate'], infer_datetime_format=True) df_test['FirstFatalityDate']=pd.to_datetime(df_test['FirstFatalityDate'], infer_datetime_format=True) ",No,5,16.0 "df['days_first_case']=(df['Date']-df['FirstCaseDate']).dt.days df['days_first_fatality']=(df['Date']-df['FirstFatalityDate']).dt.days",No,5,8.0 df['days_first_case'],No,5,41.0 "df_test['days_first_case']=(df_test['Date']-df_test['FirstCaseDate']).dt.days df_test['days_first_fatality']=(df_test['Date']-df_test['FirstFatalityDate']).dt.days",No,5,8.0 df_test['days_first_case'],No,3,41.0 "df['days_first_case']=np.where(df['days_first_case']<0,0,df['days_first_case'].fillna(0)) df['days_first_fatality']=np.where(df['days_first_fatality']<0,0,df['days_first_fatality'].fillna(0))",No,5,17.0 "df_test['days_first_case']=np.where(df_test['days_first_case']<0,0,df_test['days_first_case'].fillna(0)) df_test['days_first_fatality']=np.where(df_test['days_first_fatality']<0,0,df_test['days_first_fatality'].fillna(0))",No,5,17.0 df.tail(),No,5,41.0 df[df['Country_Region']=='Brazil'].tail(),No,3,41.0 df_test[df_test['Country_Region']=='Brazil'].tail(),No,4,41.0 "add_datepart(df,'Date',drop=False)",No,5,8.0 "add_datepart(df_test,'Date',drop=False)",No,5,13.0 "external = pd.read_csv('/kaggle/input/covid19-week2-external-data/external_data.csv',sep=';',decimal=',')",No,5,45.0 external.head(),No,5,41.0 "df = df.merge(external, left_on='key',right_on='key',how='left')",No,5,32.0 "df_test = df_test.merge(external, left_on='key',right_on='key',how='left')",No,5,32.0 list(df),No,5,41.0 "df['Confirmedlast43'] = df['ConfirmedCases'].shift(43) df['Fatalitieslast43'] = df['Fatalities'].shift(43)",No,5,8.0 "df['is_valid'] = np.where(df['Date']<'2020-03-29', False, True)",No,5,8.0 df.groupby('is_valid').size(),No,5,60.0 "df['ConfirmedLog'] = np.log(df['ConfirmedCases']+1) df['FatalitiesLog'] = np.log(df['Fatalities']+1)",No,5,8.0 "cat_vars = ['Province_State','Country_Region','province_flag'] cont_vars = ['Elapsed', 'days_first_case', 'days_first_fatality', 'pop_density', 'population', 'area', 'lat_min', 'lat_max', 'lon_min', 'lon_max', 'centroid_x', 'centroid_y', 'wdi_country_population', 'wdi_country_arrivals', 'wdi_arrivals_per_capita', 'wdi_gini', 'wdi_perc_urban_pop', 'wdi_perc_handwashing', 'wdi_uhc_coverage', 'wdi_hospital_beds_p1000', 'wdi_smoke_prevalence', 'wdi_diabetes_prevalence', 'wdi_gdp_per_capita_ppp', 'wdi_perc_death_comm_diseases', 'wdi_perc_death_non_comm_diseases', 'wdi_death_rate_p1000', 'wdi_perc_basic_sanitation', 'wdi_dom_govmt_healt_exped_gdp', 'wdi_dom_govmt_healt_exped_per_cap', 'wdi_perc_females', 'wdi_perc_males', 'wdi_perc_female_20_29', 'wdi_perc_female_30_39', 'wdi_perc_female_40_49', 'wdi_perc_female_50_59', 
'wdi_perc_female_60_69', 'wdi_perc_female_70_79', 'wdi_perc_female_80p', 'wdi_perc_male_20_29', 'wdi_perc_male_30_39', 'wdi_perc_male_40_49', 'wdi_perc_male_50_59', 'wdi_perc_male_60_69', 'wdi_perc_male_70_79', 'wdi_perc_male_80p', 'wdi_pop_denisty', 'wdi_perc_anual_growth_pop','Month','Week','Dayofyear']#,'Confirmedlast43','Fatalitieslast43'] dep_vars = ['ConfirmedLog']#,'Fatalities']",No,5,77.0 "procs = [FillMissing, Categorify, Normalize] splits = ColSplitter('is_valid')(df)",No,2,13.0 splits,No,5,41.0 "to = TabularPandas(df, procs, cat_names=cat_vars, cont_names=cont_vars, y_names=dep_vars, y_block=RegressionBlock(), splits=splits)",No,5,12.0 to,No,5,53.0 cats.shape,No,5,58.0 conts.shape,No,5,58.0 y.shape,No,5,58.0 dls.c,No,5,40.0 learn.load('/kaggle/input/covid19-week2-external-data/best18_2day'),No,5,30.0 "preds, y = learn.get_preds()",No,5,48.0 np.exp(preds)-1,No,5,55.0 np.exp(y)-1,No,5,53.0 raw_test_preds = learn.get_preds(dl=dl),No,5,48.0 raw_test_preds[0],No,4,41.0 raw_test_preds[1],No,3,41.0 preds = np.exp(to_np(raw_test_preds[0]))-1,No,5,55.0 preds[:40],No,5,41.0 df['ConfirmedCases'].head(40),No,5,41.0 "preds = pd.DataFrame(preds) preds.columns = ['pred_confirmed']",No,4,12.0 preds.tail(),No,5,41.0 "df = pd.concat([df, preds], axis=1)",No,5,11.0 preds_confirmed_test = np.exp(to_np(raw_test_preds[0]))-1,No,5,55.0 preds_confirmed_test[:40],No,5,41.0 learn.load('/kaggle/input/covid19-week2-external-data/fat_best2_2day'),No,5,30.0 preds,No,5,41.0 y,No,5,53.0 preds_confirmed_test = pd.DataFrame(preds_confirmed_test),No,5,12.0 preds_confirmed_test.columns=['ConfirmedCases'],No,5,61.0 preds_confirmed_test.tail(),No,5,41.0 preds_confirmed_test.shape,No,5,58.0 df_test.shape,No,5,58.0 "df_test = pd.concat([df_test, preds_confirmed_test], axis=1)",No,5,11.0 df_test.tail(40),No,5,41.0 df_test['pred_confirmed']=df_test['ConfirmedCases'],No,5,8.0 preds_fatalities_test = np.exp(to_np(raw_test_preds[0]))-1,No,5,55.0 preds_fatalities_test,No,5,41.0 preds_fatalities_test = pd.DataFrame(preds_fatalities_test),No,5,12.0 preds_fatalities_test.columns=['Fatalities'],No,5,61.0 preds_fatalities_test.tail(),No,5,41.0 preds_fatalities_test.shape,No,5,58.0 "df_test = pd.concat([df_test, preds_fatalities_test], axis=1)",No,5,11.0 df_test[df_test['Country_Region']=='Brazil'].head(20),No,3,41.0 sub_ex = df = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-2/submission.csv') ,No,5,45.0 sub_ex.head(),No,5,41.0 "sub = df_test[['ForecastId','ConfirmedCases','Fatalities']]",No,5,55.0 sub.head(),No,5,41.0 sub_ex.tail(),No,5,41.0 sub.tail(),No,5,41.0 "import os, gc, pickle, copy, datetime, warnings import numpy as np import pandas as pd import matplotlib.pyplot as plt import seaborn as sns import lightgbm as lgb from sklearn import metrics pd.set_option('display.max_columns', 100) warnings.filterwarnings('ignore')",No,2,22.0 "df_train = pd.read_csv(""../input/covid19-global-forecasting-week-2/train.csv"") print(df_train.shape) df_train.head()",Yes,4,45.0 "df_test = pd.read_csv(""../input/covid19-global-forecasting-week-2/test.csv"") print(df_test.shape) df_test.head()",Yes,4,45.0 "# concat train and test df_traintest = pd.concat([df_train, df_test]) print(df_train.shape, df_test.shape, df_traintest.shape)",No,3,11.0 "# process date df_traintest['Date'] = pd.to_datetime(df_traintest['Date']) df_traintest['day'] = df_traintest['Date'].apply(lambda x: x.dayofyear).astype(np.int16) df_traintest.head()",Yes,4,8.0 "# concat Country/Region and Province/State def func(x): try: x_new = x['Country_Region'] + 
""/"" + x['Province_State'] except: x_new = x['Country_Region'] return x_new df_traintest['place_id'] = df_traintest.apply(lambda x: func(x), axis=1) df_traintest.head()'",No,4,8.0 df_traintest[(df_traintest['day']>=day_before_public-3) & (df_traintest['place_id']=='China/Hubei')].head(),No,3,41.0 "# concat lat and long df_latlong = pd.read_csv(""../input/smokingstats/df_Latlong.csv"") df_latlong.head()",No,4,45.0 "# concat Country/Region and Province/State def func(x): try: x_new = x['Country/Region'] + ""/"" + x['Province/State'] except: x_new = x['Country/Region'] return x_new df_latlong['place_id'] = df_latlong.apply(lambda x: func(x), axis=1) df_latlong = df_latlong[df_latlong['place_id'].duplicated()==False] df_latlong.head()'",No,5,8.0 "df_traintest = pd.merge(df_traintest, df_latlong[['place_id', 'Lat', 'Long']], on='place_id', how='left') df_traintest.head()",No,5,32.0 "print(pd.isna(df_traintest['Lat']).sum()) # count Nan df_traintest[pd.isna(df_traintest['Lat'])].head()",No,3,39.0 " data = pd.read_csv( '/kaggle/input/restaurant-revenue-prediction/train.csv.zip') test_data = pd.read_csv('/kaggle/input/restaurant-revenue-prediction/test.csv.zip') data.head() ",No,4,45.0 " data.describe() ",No,5,40.0 data.dtypes,No,5,70.0 data['City Group'].unique(),No,5,57.0 data['City'].unique(),No,5,57.0 "#Creating a flag for each type of restaurant data['Type_IL'] = np.where(data['Type'] == 'IL', 1, 0) data['Type_FC'] = np.where(data['Type'] == 'FC', 1, 0) data['Type_DT'] = np.where(data['Type'] == 'DT', 1, 0) #Creating a flag for 'Big Cities' data['Big_Cities'] = np.where(data['City Group'] == 'Big Cities', 1, 0) #Converting Open_Date into day count #Considering the same date the dataset was made available data['Days_Open'] = (pd.to_datetime('2015-03-23') - pd.to_datetime(data['Open Date'])).dt.days #Removing unused columns data = data.drop('Type', axis=1) data = data.drop('City Group', axis=1) data = data.drop('City', axis=1) data = data.drop('Open Date', axis=1) #Adjusting test data as well test_data['Type_IL'] = np.where(test_data['Type'] == 'IL', 1, 0) test_data['Type_FC'] = np.where(test_data['Type'] == 'FC', 1, 0) test_data['Type_DT'] = np.where(test_data['Type'] == 'DT', 1, 0) test_data['Big_Cities'] = np.where(test_data['City Group'] == 'Big Cities', 1, 0) test_data['Days_Open'] = (pd.to_datetime('2015-03-23') - pd.to_datetime(test_data['Open Date'])).dt.days test_data = test_data.drop('Type', axis=1) test_data = test_data.drop('City Group', axis=1) test_data = test_data.drop('City', axis=1) test_data = test_data.drop('Open Date', axis=1)",No,4,8.0 " from sklearn import model_selection from sklearn import linear_model X = data.drop(['Id', 'revenue'], axis=1) Y = data.revenue ",No,5,21.0 " from sklearn.linear_model import Lasso from sklearn.linear_model import Ridge from sklearn import metrics def check_rmse(X, Y, alpha): RMSE_lasso = [] RMSE_ridge = [] for i in alpha: lasso = Lasso(alpha=i) lasso.fit(X, Y) ridge = Ridge(alpha=i) ridge.fit(X, Y) RMSE_lasso.append(metrics.mean_squared_error(Y, lasso.predict(X))) RMSE_ridge.append(metrics.mean_squared_error(Y, ridge.predict(X))) return (RMSE_lasso, RMSE_ridge) ",No,5,84.0 "plt.figure() plt.plot(alpha, RMSE_lasso, 'o-', color=""r"", label=""RMSE_lasso"") plt.legend(loc='best') plt.show()'",No,5,35.0 "lasso = Lasso(alpha=5.5) lasso.fit(X, Y) metrics.mean_squared_error(Y, lasso.predict(X))",No,4,7.0 " model = Lasso(alpha=5.5) model.fit(X, Y) test_predicted = pd.DataFrame() test_predicted['Id'] = test_data.Id test_predicted['Prediction'] = 
model.predict(test_data.drop('Id', axis=1)) test_predicted.to_csv('submission-lasso-5.5.csv', index=False) test_predicted.describe() ",No,3,28.0 " from sklearn.ensemble import RandomForestRegressor model = RandomForestRegressor(n_estimators=150) model.fit(X, Y) test_predicted = pd.DataFrame() test_predicted['Id'] = test_data.Id test_predicted['Prediction'] = model.predict(test_data.drop('Id', axis=1)) test_predicted.to_csv('submission-random-forest.csv', index=False) test_predicted.describe() ",No,3,7.0 " model = Ridge(alpha=330) model.fit(X, Y) test_predicted = pd.DataFrame() test_predicted['Id'] = test_data.Id test_predicted['Prediction'] = model.predict(test_data.drop('Id', axis=1)) test_predicted.to_csv('submission-ridge-330.csv', index=False) test_predicted.describe() ",No,3,7.0 " model = Lasso(alpha=200000) model.fit(X, Y) test_predicted = pd.DataFrame() test_predicted['Id'] = test_data.Id test_predicted['Prediction'] = model.predict(test_data.drop('Id', axis=1)) test_predicted.to_csv('submission-lasso-high-alpha.csv', index=False) test_predicted.describe() ",No,4,7.0 "data['Days_Open'].unique() ",No,5,57.0 " data['Time_Open'] = round(data['Days_Open'] / 700, 0) data = data.drop('Days_Open', axis=1) test_data['Time_Open'] = round(test_data['Days_Open'] / 700, 0) test_data = test_data.drop('Days_Open', axis=1) ",No,4,8.0 " X = data.drop(['Id', 'revenue'], axis=1) Y = data.revenue ",No,5,21.0 "model = Ridge(alpha=330) model.fit(X, Y) test_predicted = pd.DataFrame() test_predicted['Id'] = test_data.Id test_predicted['Prediction'] = model.predict(test_data.drop('Id', axis=1)) test_predicted.to_csv('submission.csv', index=False) test_predicted",No,4,4.0 " model = Lasso(alpha=200000) model.fit(X, Y) test_predicted = pd.DataFrame() test_predicted['Id'] = test_data.Id test_predicted['Prediction'] = model.predict(test_data.drop('Id', axis=1)) test_predicted.to_csv('submission-lasso-.csv', index=False) test_predicted.describe() ",No,4,7.0 "# %% import numpy as np import itertools import plotly.express as px import pandas as pd from plotly.subplots import make_subplots from multiprocessing import Pool class CoronaSim: def __init__(self, grid_size, initial_virus, recover_time, speedreaction, incubation, virulence, contactsize=1, num_cores=4): self.sim_grid = np.zeros(shape=[grid_size, grid_size]) ini_x_virus = np.random.randint( low=0, high=grid_size, size=initial_virus) ini_y_virus = np.random.randint( low=0, high=grid_size, size=initial_virus) self.inistate_matrix = np.zeros(shape=[grid_size, grid_size]) self.inistate_matrix.fill(float(recover_time)) self.recover_time = recover_time self.inistate_matrix[ini_x_virus, ini_y_virus] = 7 self.speedreaction = speedreaction self.incubation = incubation self.samplesize = contactsize self.virulence = virulence self.num_cores = num_cores self.all_sites = list(itertools.product( range(self.sim_grid.shape[0]), range(self.sim_grid.shape[0]))) def mechanismcheck(self): state_value = np.arange(31) valuedf = pd.DataFrame( {'state': state_value, 'Activity': self.activity(state_value)}) f1 = px.scatter(valuedf, x=""state"", y=""Activity"") f1.data[0].update(mode='markers+lines') f1.update_traces(line_color='#B54434', marker_line_width=3, marker_size=4) distance = np.arange(200) disp = np.exp(-self.gm_virulence(20)*distance**2) contactdf = pd.DataFrame({'distance': distance, 'disp': disp}) f2 = px.line(contactdf, x=""distance"", y=""disp"") f2.data[0].update(mode='markers+lines') f2.update_traces(line_color='#1B813E', marker_line_width=3, marker_size=4) 
infected_num = np.arange(10000) measuredf = pd.DataFrame( {'infected_num': infected_num, 'measure': self.gm_virulence(infected_num)}) f3 = px.line(measuredf, x=""infected_num"", y=""measure"") f3.update_traces(line_color='#66327C', marker_line_width=3, marker_size=4) trace1 = f1['data'][0] trace2 = f2['data'][0] trace3 = f3['data'][0] fig = make_subplots(rows=3, cols=1, shared_xaxes=False, subplot_titles=( ""Figure 1"", ""Figure 2"", ""Figure 3"")) fig.add_trace(trace1, row=1, col=1) fig.add_trace(trace2, row=2, col=1) fig.add_trace(trace3, row=3, col=1) # Update xaxis properties fig.update_xaxes(title_text=""Health state"", row=1, col=1) fig.update_xaxes(title_text=""Distance"", range=[10, 50], row=2, col=1) fig.update_xaxes(title_text=""The number of infected cases"", showgrid=False, row=3, col=1) # Update yaxis properties fig.update_yaxes(title_text=""Willingness"", row=1, col=1) fig.update_yaxes(title_text=""Contact rate"", showgrid=False, row=2, col=1) fig.update_yaxes( title_text=""Intensity of the restriction"", row=3, col=1) # fig['layout'].update(height=800, width=800, showlegend=False) fig.update_layout( xaxis=dict( showline=True, showgrid=False, showticklabels=True, linecolor='rgb(204, 204, 204)', linewidth=2, ticks='outside', tickfont=dict( family='Arial', size=12, color='rgb(82, 82, 82)', ), ), yaxis=dict( showline=True, showgrid=False, showticklabels=True, linecolor='rgb(204, 204, 204)', linewidth=2, ticks='outside', tickfont=dict( family='Arial', size=12, color='rgb(82, 82, 82)', ), ), xaxis2=dict( showline=True, showgrid=False, showticklabels=True, linecolor='rgb(204, 204, 204)', linewidth=2, ticks='outside', tickfont=dict( family='Arial', size=12, color='rgb(82, 82, 82)', ), ), yaxis2=dict( showline=True, showgrid=False, showticklabels=True, linecolor='rgb(204, 204, 204)', linewidth=2, ticks='outside', tickfont=dict( family='Arial', size=12, color='rgb(82, 82, 82)', ), ), xaxis3=dict( showline=True, showgrid=False, showticklabels=True, linecolor='rgb(204, 204, 204)', linewidth=2, ticks='outside', tickfont=dict( family='Arial', size=12, color='rgb(82, 82, 82)', ), ), yaxis3=dict( showline=True, showgrid=False, showticklabels=True, linecolor='rgb(204, 204, 204)', linewidth=2, ticks='outside', tickfont=dict( family='Arial', size=12, color='rgb(82, 82, 82)', ), ), autosize=True, plot_bgcolor='white', height=800, width=800, ) fig.show() def activity(self, state): disp = np.exp((state-self.incubation) ** 2 / self.virulence ** 2) return disp def gm_virulence(self, infected_num): return 100*(2/(1+np.exp(-infected_num*self.speedreaction/(self.sim_grid.shape[0]*self.sim_grid.shape[1])))-1) def spread_prob(self, x_row, y_col, state, seed=1): np.random.seed(seed) distance_sites = np.linalg.norm( np.array(self.all_sites) - np.array([x_row, y_col]), axis=1) Act = self.activity(state) gm_virulence = self.gm_virulence( infected_num=len(np.where(state < self.recover_time)[0])) prob_spread = np.exp(-gm_virulence * distance_sites ** 2) * Act[x_row, y_col] * Act.flatten() prob_spread[x_row*self.sim_grid.shape[1]+y_col] = 0 focal_state = np.random.choice(range( self.sim_grid.shape[0]*self.sim_grid.shape[1]), size=self.samplesize, p=prob_spread/sum(prob_spread)) focal_state_value = 0 if min(state.flatten()[focal_state]) < self.recover_time else self.recover_time return focal_state_value def simspread(self, t_end, savefile): self.savefile = savefile state_matrix = self.inistate_matrix output_list = [] parallel_cores = Pool(self.num_cores) for t in range(t_end): num_infected = 
len(np.where(state_matrix < self.recover_time)[0]) print( f'At Day {t}, {num_infected} infected cases are confirmed...') healthy_individual_index_row = np.where(state_matrix >= self.recover_time)[0] healthy_individual_index_col = np.where(state_matrix >= self.recover_time)[1] change_state = parallel_cores.starmap(self.spread_prob, zip(healthy_individual_index_row, healthy_individual_index_col, itertools.repeat(state_matrix))) state_matrix[healthy_individual_index_row, healthy_individual_index_col] = change_state state_matrix += 1 output_list.append(state_matrix.tolist()) np.savez(self.savefile, *output_list) return state_matrix if __name__ == ""__main__"": test = CoronaSim(grid_size=100, initial_virus=5, contactsize=2,num_cores=6, recover_time=30, speedreaction=0.01, incubation=10, virulence=25) test.mechanismcheck()'",No,5,53.0 "# Start running simulations result = test.simspread(t_end=10, savefile='test.npz')",No,5,53.0 "# Simulation setup scenario1 = CoronaSim(grid_size=100, initial_virus=5, contactsize=2, num_cores=6, recover_time=30, speedreaction=0.01, incubation=7, virulence=25)",No,5,53.0 "# %% import plotly.graph_objects as go import numpy as np import pandas as pd num_infected = [] Day = [] batch_list = [] for batch in range(1, 4): savefile = f'../input/simulation-scripts/outfile_s{batch}.npz' container = np.load(savefile) sim_result = [container[key] for key in container] for t in range(len(sim_result)): num_infected.append(len(np.where(sim_result[t] < 30)[0])) Day.extend(np.arange(len(sim_result)).tolist()) batch_list.extend(np.repeat(batch, len(sim_result))) infected_growth_df = pd.DataFrame( {'num_infected': num_infected, 'Day': Day, 'batch': batch_list}) # %% # Add data fig = go.Figure() # Create and style traces fig.add_trace(go.Scatter(x=infected_growth_df[infected_growth_df['batch'] == 1].Day, y=infected_growth_df[infected_growth_df['batch'] == 1].num_infected, name='Speed 0.01', line=dict(color='firebrick', width=4))) fig.add_trace(go.Scatter(x=infected_growth_df[infected_growth_df['batch'] == 2].Day, y=infected_growth_df[infected_growth_df['batch'] == 2].num_infected, name='Speed 0.1', line=dict(color='royalblue', width=4, dash='dot'))) fig.add_trace(go.Scatter(x=infected_growth_df[infected_growth_df['batch'] == 3].Day, y=infected_growth_df[infected_growth_df['batch'] == 3].num_infected, name='Speed 1', line=dict(color='green', width=4, dash='dash') # dash options include 'dash', 'dot', and 'dashdot' )) # Edit the layout fig.update_layout(title='The influence of government reaction speed on the pandemic development', xaxis_title='Day', yaxis_title='Number of infected cases', xaxis=dict( showline=True, showgrid=False, showticklabels=True, linecolor='rgb(204, 204, 204)', linewidth=2, ticks='outside', tickfont=dict( family='Arial', size=12, color='rgb(82, 82, 82)', ), ), yaxis=dict( showline=True, showgrid=False, showticklabels=True, linecolor='rgb(204, 204, 204)', linewidth=2, ticks='outside', tickfont=dict( family='Arial', size=12, color='rgb(82, 82, 82)', ), ), autosize=True, plot_bgcolor='white', height=600, width=800 ) fig.show() # %% ",Yes,2,33.0 "# %% import plotly.graph_objects as go import numpy as np import pandas as pd num_infected = [] Day = [] batch_list = [] for batch in range(1, 4): savefile = f'../input/simulation-scripts/outfile_s{batch}.npz' container = np.load(savefile) sim_result = [container[key] for key in container] acc_list = [] for t in range(1,len(sim_result)): acc_list.append(len(np.where(sim_result[t] < 30)[0])-len(np.where(sim_result[t-1] < 
30)[0])) num_infected.extend(acc_list) Day.extend(np.arange(len(sim_result)-1).tolist()) batch_list.extend(np.repeat(batch, len(sim_result)-1)) infected_growth_df = pd.DataFrame( {'num_infected': num_infected, 'Day': Day, 'batch': batch_list}) # %% # Add data fig = go.Figure() # Create and style traces fig.add_trace(go.Scatter(x=infected_growth_df[infected_growth_df['batch'] == 1].Day, y=infected_growth_df[infected_growth_df['batch'] == 1].num_infected, name='Speed 0.01', line=dict(color='firebrick', width=4),fill='tozeroy')) fig.add_trace(go.Scatter(x=infected_growth_df[infected_growth_df['batch'] == 2].Day, y=infected_growth_df[infected_growth_df['batch'] == 2].num_infected, name='Speed 0.1', line=dict(color='royalblue', width=4, dash='dot'),fill='tozeroy')) fig.add_trace(go.Scatter(x=infected_growth_df[infected_growth_df['batch'] == 3].Day, y=infected_growth_df[infected_growth_df['batch'] == 3].num_infected, name='Speed 1', line=dict(color='green', width=4, dash='dash'), # dash options include 'dash', 'dot', and 'dashdot' fill='tozeroy')) # Edit the layout fig.update_layout(title='', xaxis_title='Day', yaxis_title='Number of newly increase infected cases', xaxis=dict( showline=True, showgrid=False, showticklabels=True, linecolor='rgb(204, 204, 204)', linewidth=2, ticks='outside', tickfont=dict( family='Arial', size=12, color='rgb(82, 82, 82)', ), ), yaxis=dict( showline=True, showgrid=False, showticklabels=True, linecolor='rgb(204, 204, 204)', linewidth=2, ticks='outside', tickfont=dict( family='Arial', size=12, color='rgb(82, 82, 82)', ), ), autosize=True, plot_bgcolor='white', height=600, width=800, ) fig.show() # %% ",Yes,5,53.0 "import plotly.express as px import pandas as pd import plotly.graph_objects as go import numpy as np datafile = '../input/covid19-global-forecasting-week-2/train.csv' data = pd.read_csv(datafile) data['PSCR'] = data.Province_State.map(str)+ '' + data.Country_Region.map(str) region = pd.unique(data['PSCR']).tolist() f_region = [] time_list = [] region_name = [] for ci in range(len(region)): region_data = data[data['PSCR'] == region[ci]] region_data = region_data[region_data.ConfirmedCases > 0] inc_percentage = (region_data.ConfirmedCases[1:].to_numpy( )-region_data.ConfirmedCases[:-1].to_numpy())/region_data.ConfirmedCases[:-1].to_numpy() # Only considering the countries with effective data if len(np.where(inc_percentage > 0)[0]) > 0: inc_percentage = inc_percentage[np.where(inc_percentage > 0)[0][0]:] f_region.extend(inc_percentage) time_list.extend([i for i in range(len(inc_percentage))]) region_name.extend([region[ci] for i in range(len(inc_percentage))]) else: pass f_df = pd.DataFrame( {'increase': f_region, 'Day': time_list, 'region': region_name}) fig = px.line(f_df, x='Day', y='increase', color='region') fig.update_layout(title='ip patterns', xaxis_title='Day', yaxis_title='Increasing percentage', xaxis=dict( showline=True, showgrid=False, showticklabels=True, linecolor='rgb(204, 204, 204)', linewidth=2, ticks='outside', tickfont=dict( family='Arial', size=12, color='rgb(82, 82, 82)', ), ), yaxis=dict( showline=True, showgrid=False, showticklabels=True, linecolor='rgb(204, 204, 204)', linewidth=2, ticks='outside', tickfont=dict( family='Arial', size=12, color='rgb(82, 82, 82)', ), ), autosize=True, plot_bgcolor='white', height=600, width=800, ) fig.show()",Yes,2,45.0 "import plotly.express as px import pandas as pd import numpy as np datafile = '../input/covid19-global-forecasting-week-2/train.csv' data = pd.read_csv(datafile) # %% all_region_data = 
data[pd.isna(data['Province_State'])] region = ['Japan', 'Israel'] # region = pd.unique(all_region_data['Country_Region']).tolist() f_region = [] time_list = [] region_name = [] for ci in range(len(region)): region_data = data[data['Country_Region'] == region[ci]] region_data = region_data[region_data.ConfirmedCases > 0] inc_percentage = (region_data.ConfirmedCases[1:].to_numpy( )-region_data.ConfirmedCases[:-1].to_numpy())/region_data.ConfirmedCases[:-1].to_numpy() # Only considering the countries with effective data if len(np.where(inc_percentage > 0)[0]) > 0: inc_percentage = inc_percentage[np.where(inc_percentage > 0)[0][0]:] f_region.extend(inc_percentage) time_list.extend([i for i in range(len(inc_percentage))]) region_name.extend([region[ci] for i in range(len(inc_percentage))]) else: pass f_df = pd.DataFrame( {'increase': f_region, 'Day': time_list, 'region': region_name}) # %% sim_data = [] speed = [0.01,0.1,1] for batch in range(1,4): result = f'../input/simulation-scripts/outfile_s{batch}.npz' container = np.load(result) speed_batch = f'Sim: speed {speed[batch-1]}' sim_result = [container[key] for key in container] num_infected = [] for t in range(len(sim_result)): num_infected.append(len(np.where(sim_result[t] < 30)[0])) inc_infected = [(num_infected[i+1]-num_infected[i])/num_infected[i] for i in range(len(num_infected)-1)] infected_growth_df = pd.DataFrame({'increase': inc_infected, 'Day': [ i for i in range(len(sim_result)-1)], 'region': speed_batch}) sim_data.append(infected_growth_df) sim_df = pd.concat(sim_data) # %% newf = f_df.append(sim_df) # %% fig = px.line(newf, x='Day', y='increase', color='region') fig.update_layout(title='ip patterns of Japan and Israel against 3 simulations', xaxis_title='Day', yaxis_title='Increasing percentage', xaxis=dict( showline=True, showgrid=False, showticklabels=True, linecolor='rgb(204, 204, 204)', linewidth=2, ticks='outside', tickfont=dict( family='Arial', size=12, color='rgb(82, 82, 82)', ), ), yaxis=dict( showline=True, showgrid=False, showticklabels=True, linecolor='rgb(204, 204, 204)', linewidth=2, ticks='outside', tickfont=dict( family='Arial', size=12, color='rgb(82, 82, 82)', ), ), autosize=True, plot_bgcolor='white', height=400, width=600, ) fig.show()",Yes,5,53.0 "# %% import numpy as np import plotly.express as px import plotly.graph_objects as go import pandas as pd class plotresult: def __init__(self, savefile): container = np.load(savefile) self.sim_result = [container[key] for key in container] def infectiongrowth(self): num_infected = [] for t in range(len(self.sim_result)): num_infected.append(len(np.where(self.sim_result[t] < 30)[0])) infected_growth_df = pd.DataFrame({'num_infected': num_infected, 'Day': [ i for i in range(len(self.sim_result))]}) fig = go.Figure() fig.add_trace(go.Scatter(x=infected_growth_df.Day, y=infected_growth_df['num_infected'], name=""AAPL High"", line_color='deepskyblue')) fig.update_layout(title_text='Infection growth', xaxis_rangeslider_visible=True) fig.show() def infectionheatmap(self): infect_dis = [] col = [] row = [] days = [] for t in range(len(self.sim_result)): temp_re = self.sim_result[t].tolist() flatten_re = [item for sublist in temp_re for item in sublist] x_co = np.tile(range(len(temp_re)), len(temp_re)) y_co = np.repeat(range(len(temp_re)), len(temp_re)) day_series = np.repeat(t, len(temp_re)**2) infect_dis.extend(flatten_re) col.extend(x_co) row.extend(y_co) days.extend(day_series) heatmapdf = pd.DataFrame( {'dis': infect_dis, 'Day': days, 'col': col, 'row': row}) fig = 
px.scatter(heatmapdf, x=""col"", y=""row"", color='dis', animation_frame=""Day"", color_continuous_scale=[(0, ""#81C7D4""), (0.2, ""#D0104C""), (1, ""#81C7D4"")]) fig.update_layout(title='The pandemic development', xaxis_title='', yaxis_title='', xaxis=dict( showline=False, showgrid=False, showticklabels=False, ), yaxis=dict( showline=False, showgrid=False, showticklabels=False, ), autosize=True, plot_bgcolor='white', height=600, width=600, coloraxis_colorbar=dict( title=""Healthy state"" ) ) fig.show() # %% if __name__ == ""__main__"": result = '../input/simulation-scripts/outfile_s1.npz' testplot = plotresult(result) # testplot.infectiongrowth() testplot.infectionheatmap() # %% '",Yes,3,22.0 "# get place list places = np.sort(df_traintest['place_id'].unique()) print(len(places))",No,3,57.0 "# calc cases, fatalities per day df_traintest2 = copy.deepcopy(df_traintest) df_traintest2['cases/day'] = 0 df_traintest2['fatal/day'] = 0 tmp_list = np.zeros(len(df_traintest2)) for place in places: tmp = df_traintest2['ConfirmedCases'][df_traintest2['place_id']==place].values tmp[1:] -= tmp[:-1] df_traintest2['cases/day'][df_traintest2['place_id']==place] = tmp tmp = df_traintest2['Fatalities'][df_traintest2['place_id']==place].values tmp[1:] -= tmp[:-1] df_traintest2['fatal/day'][df_traintest2['place_id']==place] = tmp print(df_traintest2.shape) df_traintest2[df_traintest2['place_id']=='China/Hubei'].head()",No,2,12.0 "pipeline = Pipeline([('scaler', StandardScaler()), ('lr', LinearRegression())]) pipeline.fit(train_numeric_X, train_numeric_Y) predicted = pipeline.predict(test_numeric_X)",Yes,3,7.0 from sklearn.svm import SVR,No,5,22.0 "pipeline = Pipeline([('scaler', StandardScaler()), ('estimator', SVR())]) pipeline.fit(train_numeric_X, train_numeric_Y.values[:,0]) pipeline2 = Pipeline([('scaler', StandardScaler()), ('estimator', SVR())]) pipeline2.fit(train_numeric_X, train_numeric_Y.values[:,1]) discovered, fatal = pipeline.predict(test_numeric_X), pipeline2.predict(test_numeric_X)",No,3,7.0 "predicted_x1 = pipeline.predict(train_numeric_X) fig = go.Figure() fig.add_trace(go.Histogram( x=train_numeric_Y['ConfirmedCases'], histnorm='percent', name='actual discovered', # name used in legend and hover labels xbins=dict( # bins used for histogram start=-4.0, end=3.0, size=0.5 ), opacity=0.75 )) fig.add_trace(go.Histogram( x=predicted_x1, histnorm='percent', name='predicted discovered', xbins=dict( start=-3.0, end=4, size=0.5 ), opacity=0.75 )) fig.update_layout( title_text='SVR Histogram of ConfirmedCases', # title of plot xaxis_title_text='bins', # xaxis label yaxis_title_text='ConfirmedCases', # yaxis label bargap=0.2, # gap between bars of adjacent location coordinates bargroupgap=0.1 # gap between bars of the same location coordinates ) fig.show()",No,5,56.0 "predicted_x2 = pipeline2.predict(train_numeric_X) fig = go.Figure() fig.add_trace(go.Histogram( x=train_numeric_Y['Fatalities'], histnorm='percent', name='actual died', # name used in legend and hover labels xbins=dict( # bins used for histogram start=-4.0, end=3.0, size=0.5 ), opacity=0.75 )) fig.add_trace(go.Histogram( x=predicted_x2, histnorm='percent', name='predicted died', xbins=dict( start=-3.0, end=4, size=0.5 ), opacity=0.75 )) fig.update_layout( title_text='SVR Histogram of Fatalities', # title of plot xaxis_title_text='bins', # xaxis label yaxis_title_text='Fatalities', # yaxis label bargap=0.2, # gap between bars of adjacent location coordinates bargroupgap=0.1 # gap between bars of the same location coordinates ) 
fig.show()",No,5,56.0 "from sklearn.model_selection import KFold kf = KFold(n_splits=10) outcomes = [] fold = 0 for train_index, test_index in kf.split(train_numeric_X): fold += 1 X_train, X_test = train_numeric_X.values[train_index], train_numeric_X.values[test_index] y_train, y_test = train_numeric_Y['ConfirmedCases'].values[train_index], train_numeric_Y['ConfirmedCases'].values[test_index] pipeline.fit(X_train, y_train) predictions = RF_model.predict(X_test) accuracy = accuracy_score(y_test, predictions) outcomes.append(accuracy) print(""Fold {0} accuracy: {1}"".format(fold, accuracy)) mean_outcome = np.mean(outcomes) print(""\n\nMean Accuracy: {0}"".format(mean_outcome)) ",No,3,7.0 from sklearn.neighbors import KNeighborsClassifier,No,5,22.0 "pipeline = Pipeline([('scaler', StandardScaler()), ('estimator', KNeighborsClassifier(n_jobs=4))]) pipeline.fit(train_numeric_X, train_numeric_Y) predicted_x = pipeline.predict(train_numeric_X)",No,4,7.0 "fig = go.Figure() fig.add_trace(go.Scatter( x=train_numeric_Y['ConfirmedCases'], y=train_numeric_Y['Fatalities'], marker=dict(color=""crimson"", size=12), mode=""markers"", name=""actual"", )) fig.add_trace(go.Scatter( x=predicted_x[:,0], y=predicted_x[:,1], marker=dict(color=""lightseagreen"", size=8), mode=""markers"", name=""predicted"", )) fig.update_layout(title=""RF result"", xaxis_title=""ConfirmedCases"", yaxis_title=""Fatalities"") fig.show()",No,5,56.0 from sklearn.ensemble import RandomForestClassifier,No,5,22.0 "RF_model = RandomForestClassifier(n_estimators=50, n_jobs=4, max_depth=5) RF_model.fit(train_numeric_X, train_numeric_Y) predicted = RF_model.predict(test_numeric_X)",No,3,7.0 predicted_x = RF_model.predict(train_numeric_X),No,5,27.0 "fig = go.Figure() fig.add_trace(go.Histogram( x=train_numeric_Y['ConfirmedCases'], histnorm='percent', name='actual discovered', # name used in legend and hover labels xbins=dict( # bins used for histogram start=-4.0, end=3.0, size=0.5 ), opacity=0.75 )) fig.add_trace(go.Histogram( x=predicted_x[:,0], histnorm='percent', name='predicted discovered', xbins=dict( start=-3.0, end=4, size=0.5 ), opacity=0.75 )) fig.update_layout( title_text='RF Histogram of ConfirmedCases', # title of plot xaxis_title_text='bins', # xaxis label yaxis_title_text='ConfirmedCases', # yaxis label bargap=0.2, # gap between bars of adjacent location coordinates bargroupgap=0.1 # gap between bars of the same location coordinates ) fig.show()",No,5,84.0 "from sklearn.metrics import make_scorer, accuracy_score accuracy_score(train_numeric_Y['ConfirmedCases'], predicted_x[:,0]), accuracy_score(train_numeric_Y['Fatalities'], predicted_x[:,1])",No,4,49.0 " from sklearn.model_selection import KFold kf = KFold(n_splits=10) outcomes = [] fold = 0 for train_index, test_index in kf.split(train_numeric_X): fold += 1 X_train, X_test = train_numeric_X.values[train_index], train_numeric_X.values[test_index] y_train, y_test = train_numeric_Y['ConfirmedCases'].values[train_index], train_numeric_Y['ConfirmedCases'].values[test_index] RF_model.fit(X_train, y_train) predictions = RF_model.predict(X_test) accuracy = accuracy_score(y_test, predictions) outcomes.append(accuracy) print(""Fold {0} accuracy: {1}"".format(fold, accuracy)) mean_outcome = np.mean(outcomes) print(""\n\nMean Accuracy: {0}"".format(mean_outcome)) ",No,3,7.0 from sklearn.ensemble import AdaBoostClassifier,No,5,22.0 "adaboost_model_for_ConfirmedCases = AdaBoostClassifier(n_estimators=5) adaboost_model_for_ConfirmedCases.fit(train_numeric_X, 
train_numeric_Y[numeric_features_Y[0]]) adaboost_model_for_Fatalities = AdaBoostClassifier(n_estimators=5) adaboost_model_for_Fatalities.fit(train_numeric_X, train_numeric_Y[numeric_features_Y[1]])",No,4,7.0 "predicted_x1 = adaboost_model_for_ConfirmedCases.predict(train_numeric_X) predicted_x2 = adaboost_model_for_Fatalities.predict(train_numeric_X) fig = go.Figure() fig.add_trace(go.Scatter( x=train_numeric_Y['ConfirmedCases'], y=train_numeric_Y['Fatalities'], marker=dict(color=""crimson"", size=12), mode=""markers"", name=""actual"", )) fig.add_trace(go.Scatter( x=predicted_x1, y=predicted_x2, marker=dict(color=""lightseagreen"", size=8), mode=""markers"", name=""predicted"", )) fig.update_layout(title=""ADB result"", xaxis_title=""ConfirmedCases"", yaxis_title=""Fatalities"") fig.show()'",No,4,56.0 train_y_pred = RF_model.predict(train_numeric_X),No,5,27.0 "plt.figure(figsize=(12,8)) plt.hist([train_numeric_Y['ConfirmedCases'],train_y_pred[:,0]],bins=100, range=(1,100), label=['ConfirmedCases_actual','ConfirmedCases_pred'],alpha=0.75) plt.title('ConfirmedCases Comparison',fontsize=20) plt.xlabel('sample',fontsize=20) plt.ylabel('match',fontsize=20) plt.legend() plt.show()",No,5,33.0 "plt.figure(figsize=(12,8)) plt.hist([train_numeric_Y['Fatalities'],train_y_pred[:,1]],bins=100, range=(1,100), label=['Fatalities_actual','Fatalities_pred'],alpha=0.75) plt.title('Fatalities Comparison',fontsize=20) plt.xlabel('sample',fontsize=20) plt.ylabel('match',fontsize=20) plt.legend() plt.show()",No,5,56.0 "error = np.sqrt((train_y_pred - train_numeric_Y)**2) error = error.cumsum()",No,5,28.0 "fig,ax = plt.subplots() plt.xlabel('sample') plt.ylabel('error') plt.subplot(2, 1, 1) plt.plot(range(len(error)), error['ConfirmedCases'], ""x-"",label=""ConfirmedCases"",color='orange') plt.legend() plt.subplot(2, 1, 2) plt.plot(range(len(error)), error['Fatalities'], ""+-"", label=""Fatalities"") plt.legend() plt.show()'",No,5,56.0 "from sklearn.metrics import mean_squared_error rmse = mean_squared_error(train_numeric_Y, train_y_pred , squared=False) rmse",No,4,49.0 "plt.bar(range(len(numeric_features_X)), RF_model.feature_importances_, tick_label=numeric_features_X) plt.xlabel('feature') plt.ylabel('weight') plt.xticks(rotation=90) plt.show()",No,5,33.0 "import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv) import os for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename)) ",No,5,88.0 "DT=1 optimize_model=False Make_submission=True n_estimators=200 #400 #500 #1500 max_depth=2 #4 #12 #8",No,5,59.0 "train=pd.read_csv(""/kaggle/input/covid19-global-forecasting-week-2/train.csv"") train.head()",No,4,45.0 "train[train['Province_State'].notna()].groupby(['Country_Region'], sort=False)['Province_State'].nunique()",No,5,54.0 "def add_location(df_old): df=df_old.copy() df['Date']=pd.to_datetime(df['Date']) df['Country_Region']=df['Country_Region'].fillna('') df['Province_State']=df['Province_State'].fillna('') df['location']=df['Province_State'].astype('str')+"" ""+df['Country_Region'].astype('str') return df'",No,3,17.0 train=add_location(train),No,5,8.0 "Confirm_pivot=pd.pivot_table(train_df,index='Date',columns='Country_Region', values='ConfirmedCases',aggfunc=np.sum)",No,5,53.0 "plt.figure(figsize=(16,8)) colors=['r','b','g','y','orange','purple','m','hotpink','violet','darkgreen','navy','brown'] for i,country in enumerate(list_countries): Confirm=Confirm_pivot[Confirm_pivot[country]>0][country].diff().fillna(0) Confirm=Confirm[Confirm>0] Confirm.plot(color=colors[i],label=country,markersize=12,lw=5) plt.title('Number of Daily Cases',fontsize=15) plt.legend(title='country') plt.tight_layout()",No,5,33.0 "plt.figure(figsize=(20,16)) colors=['r','b','g','y','orange','purple','m','hotpink','violet','darkgreen','navy','brown'] for i,country in enumerate(list_countries): Confirm=Confirm_pivot[Confirm_pivot[country]>0][country].diff().fillna(0) Confirm=Confirm[Confirm>0] plt.subplot(4,3,i+1) Confirm.plot(color=colors[i],label=country,markersize=12,lw=5) plt.xticks() plt.legend(title='Country') plt.title('Number of Daily Cases in {}'.format(country.upper())) plt.tight_layout()",No,5,33.0 "train.set_index('location',inplace=True) train['day_of_year']=train['Date'].dt.dayofyear train['day_of_week']=train['Date'].dt.dayofweek first_day=train[(train['ConfirmedCases']>0)].groupby(['location'], sort=False)['day_of_year'].min() first_day.rename('first_day',inplace=True)",No,4,60.0 "def add_days_passed(df_old,first_day): df=df_old.copy() df=pd.concat([df,first_day],axis=1,join='inner') df['days_passed']=df['day_of_year']-df['first_day'] df.drop(columns=['first_day'],inplace=True) df['location']=df.index df.set_index('Id',inplace=True) df['Id']=df.index return df ",No,4,8.0 "train=add_days_passed(train,first_day) train.head()",No,5,8.0 "country_stat=pd.read_csv('../input/countryinfo/covid19countryinfo.csv') country_stat = country_stat[country_stat['region'].isnull()] def add_country_stat(old_df,country_stat): df=old_df.copy() df=df.merge(country_stat[['country','pop','medianage','sex65plus','lung','smokers','density']],left_on=['Country_Region'],right_on=['country'],how='left') df.drop(columns=['country'],inplace=True) df['pop']=df['pop'].fillna(1000) df['pop']=df['pop'].apply(lambda x: int(str(x).replace(',', ''))) #df['gdp2019']=df['gdp2019'].fillna(0) #df['gdp2019']=df['gdp2019'].apply(lambda x: int(str(x).replace(',', ''))) #df['gdp2019']=df['gdp2019']/df['pop'] df['density']=df['density'].fillna(0) df['medianage']=df['medianage'].fillna(0) #df['sexratio']=df['sexratio'].fillna(1) df['sex65plus']=df['sex65plus'].fillna(1) df['lung']=df['lung'].fillna(24) df['smokers']=df['smokers'].fillna(24) #df['lung']=df['lung']*df['pop'] return df train=add_country_stat(train,country_stat)",Yes,3,45.0 country_stat.info(),No,5,40.0 
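# --- Editor's sketch (added for clarity, not from the original notebook): the
# 'pop' column of covid19countryinfo.csv appears to store population as strings
# with thousands separators (e.g. '1,234,567'), which is why add_country_stat()
# above strips the commas before casting to int. A minimal, self-contained
# illustration of that cleaning step:
import pandas as pd
demo = pd.DataFrame({'pop': ['1,234,567', '89,100', None]})
demo['pop'] = demo['pop'].fillna(1000)  # same default used in add_country_stat()
demo['pop'] = demo['pop'].apply(lambda x: int(str(x).replace(',', '')))
print(demo)  # -> 1234567, 89100, 1000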
"border_info=pd.read_csv(""https://raw.githubusercontent.com/geodatasource/country-borders/master/GEODATASOURCE-COUNTRY-BORDERS.CSV"") border_info.drop(columns=[""country_code"",""country_border_code""],inplace=True) border_info.replace({'United States of America':'US', 'United Kingdom of Great Britain and Northern Ireland':'United Kingdom', 'Bolivia (Plurinational State Of)':'Bolivia', 'Brunei Darussalam':'Brunei', 'Gambia (the)':'Gambia', 'Congo':'Congo (Kinshasa)', 'Cote dIvoire':""Cote d'Ivoire"", ""Iran (Islamic Republic of)"":'Iran', ""Korea (the Republic of)"":'Korea, South', ""Lao People's Democratic Republic"":'Laos', ""Moldova (the Republic of)"":'Moldova', ""Russian Federation"":'Russia', ""Syrian Arab Republic"":'Syria', ""Taiwan (Province of China)"":'Taiwan*', ""Tanzania (the United Republic of)"":'Tanzania', ""Venezuela (Bolivarian Republic of)"":'Venezuela', ""Viet Nam"":'Vietnam'},inplace=True) border_info=border_info.fillna("""") border_info.to_csv(""border_info.csv"") '",Yes,2,45.0 "from itertools import product as it_product def expand_grid(data_dict): rows = it_product(*data_dict.values()) return pd.DataFrame.from_records(rows, columns=data_dict.keys())",No,4,12.0 "skel=expand_grid({'Index':border_info.index,'Date':train['Date'].unique()}) skel.info()",No,4,12.0 "country_info=train.groupby(['Date','Country_Region'])['ConfirmedCases'].sum()",No,5,60.0 "skel=expand_grid({'Index':border_info.index,'Date':train['Date'].unique()}) skel=skel.merge(border_info, how='inner', left_on=['Index'],right_index=True) skel=skel.merge(country_info, how='inner', left_on=['Date','country_border_name'],right_on=['Date','Country_Region'])",No,4,32.0 "from datetime import timedelta skel['Date']=skel['Date']+timedelta(days=DT) border_cases=skel.groupby(['country_name','Date'])['ConfirmedCases'].sum() len(skel['country_name'].unique())",No,3,8.0 "train=train.merge(border_cases, how='left', left_on=['Country_Region','Date'],right_on=['country_name','Date']) train['ConfirmedCases_y']=train['ConfirmedCases_y'].fillna(0) train.rename(columns={'ConfirmedCases_y':'ConfirmedCases_neighbors','ConfirmedCases_x':'ConfirmedCases'},inplace=True)",No,4,32.0 "big_train = pd.concat([train,pd.get_dummies(train['location'], prefix='loc')],axis=1) big_train['ConfirmedCases_neighbors']=np.log1p(big_train['ConfirmedCases_neighbors']) big_train.reset_index(inplace=True) big_train.drop(columns=[""Id""],inplace=True)'",No,3,11.0 big_train.shape,No,5,58.0 "def df_add_deltas(df_old): df=df_old.copy() df=df.sort_values(by=['location', 'Date']) df['d_ConfirmedCases'] = df.groupby(['location'])['ConfirmedCases'].diff() df['d_Fatalities'] = df.groupby(['location'])['Fatalities'].diff() df.loc[df['d_Fatalities']<0,'d_Fatalities']=0 df.loc[df['d_ConfirmedCases']<0,'d_ConfirmedCases']=0 df['prev_ConfirmedCases']=df['ConfirmedCases']-df['d_ConfirmedCases'] df['prev_Fatalities']=df['Fatalities']-df['d_Fatalities'] df['prev_ConfirmedCases']=np.log1p(df['prev_ConfirmedCases']) df['prev_Fatalities']=np.log1p(df['prev_Fatalities']) first_day_stat=df[df['Date']=='2020-01-22'] df.drop(df[df['Date']=='2020-01-22'].index, inplace=True) return df,first_day_stat ",No,5,8.0 "big_train,first_day_stat=df_add_deltas(big_train)",No,5,8.0 "big_train.reset_index(inplace=True,drop=True)",No,5,84.0 "X=big_train.drop(columns=['Province_State','Country_Region','Date','ConfirmedCases','Fatalities','location', 'd_ConfirmedCases','d_Fatalities']) y=big_train['d_ConfirmedCases'] y_2=big_train['d_Fatalities']",No,4,10.0 "max_day=X['day_of_year'].max() 
mask_train=X['day_of_year']<max_day-DT+1 mask_test=X['day_of_year']>=max_day-DT+1",No,4,8.0 "X_train=X[mask_train] X_test=X[mask_test] y_train=y[mask_train] y_test=y[mask_test] y_train_2=y_2[mask_train] y_test_2=y_2[mask_test]",No,5,21.0 X_test['day_of_year'].nunique(),No,5,54.0 "X_train.drop(columns=['day_of_year'],inplace=True) #including day of year makes things worse RMSLE goes up from 0.49 to 0.7 X_test.drop(columns=['day_of_year'],inplace=True) #including day of year makes things worse RMSLE goes up from 0.49 to 0.7 X_train.drop(columns=['day_of_week'],inplace=True) #including day of week makes things worse RMSLE goes up from 0.49 to 0.57 X_test.drop(columns=['day_of_week'],inplace=True) #including day of week makes things worse RMSLE goes up from 0.49 to 0.57 X.drop(columns=['day_of_year'],inplace=True) X.drop(columns=['day_of_week'],inplace=True) ",No,5,10.0 "X.drop(columns=['index'],inplace=True) X_train.drop(columns=['index'],inplace=True) X_test.drop(columns=['index'],inplace=True)",No,5,10.0 "# Best: -0.252369 using {'max_depth': 6, 'n_estimators': 1500} # Best: -1.051575 using {'max_depth': 6, 'n_estimators': 500} - predict shifts log # Best: -278.598983 using {'max_depth': 10, 'n_estimators': 500} - predict values # Best: -1.111758 using {'max_depth': 6, 'n_estimators': 500} - predict shifts log, knowing prev log import xgboost as xgb from sklearn.model_selection import GridSearchCV if optimize_model: model = xgb.XGBRegressor(random_state=42) n_estimators_grid = [500, 1000,1500] max_depth_grid = [6, 8, 10] param_grid = dict(max_depth=max_depth_grid, n_estimators=n_estimators_grid) grid_search = GridSearchCV(model, param_grid, scoring=""neg_root_mean_squared_error"", n_jobs=-1, cv=[(X[mask_train].index,X[mask_test].index)], verbose=1) grid_result = grid_search.fit(X,np.log1p(y)) # summarize results print(""Best: %f using %s"" % (grid_result.best_score_, grid_result.best_params_)) print(grid_result.cv_results_)'",No,3,6.0 "# Best: -0.211438 using {'max_depth': 5, 'n_estimators': 2500} # Best: -0.974302 using {'max_depth': 5, 'n_estimators': 400} - predict shifts log # Best: -274.964946 using {'max_depth': 12, 'n_estimators': 500} # Best: -1.064197 using {'max_depth': 5, 'n_estimators': 400} if optimize_model: model = xgb.XGBRegressor(random_state=42) n_estimators_grid = [400,500,600] max_depth_grid = [5,6,7] param_grid = dict(max_depth=max_depth_grid, n_estimators=n_estimators_grid) grid_search = GridSearchCV(model, param_grid, scoring=""neg_root_mean_squared_error"", n_jobs=-1, cv=[(X[mask_train].index,X[mask_test].index)], verbose=1) grid_result = grid_search.fit(X,np.log1p(y)) # summarize results print(""Best: %f using %s"" % (grid_result.best_score_, grid_result.best_params_)) print(grid_result.cv_results_)'",Yes,3,6.0 "#Best: -0.211107 using {'max_depth': 5, 'n_estimators': 3000} #Best: -0.940498 using {'max_depth': 4, 'n_estimators': 400} #Best: -274.964946 using {'max_depth': 12, 'n_estimators': 500} #Best: -0.861262 using {'max_depth': 2, 'n_estimators': 200} if optimize_model: model = xgb.XGBRegressor(random_state=42) n_estimators_grid = [150,200,250] max_depth_grid = [1,2,3] param_grid = dict(max_depth=max_depth_grid, n_estimators=n_estimators_grid) grid_search = GridSearchCV(model, param_grid, scoring=""neg_root_mean_squared_error"", n_jobs=-1, cv=[(X[mask_train].index,X[mask_test].index)], verbose=1) grid_result = grid_search.fit(X,np.log1p(y)) # summarize results print(""Best: %f using %s"" % (grid_result.best_score_, grid_result.best_params_)) print(grid_result.cv_results_)'",Yes,3,6.0
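# Editor's note: `reg` and `reg_2` are fitted in the next cells, but the cell
# that constructs them does not appear in this extract. A plausible definition,
# assuming the XGBoost regressor tuned above and the n_estimators/max_depth
# constants from the configuration cell (n_estimators=200, max_depth=2, which
# match the best grid-search result of -0.861262), would be:
import xgboost as xgb
reg = xgb.XGBRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
reg_2 = xgb.XGBRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=42)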
"reg.fit(X_train,np.log1p(y_train))",No,5,7.0 y_pred = reg.predict(X_test),No,5,48.0 "from sklearn.metrics import mean_squared_error np.sqrt(mean_squared_error(y_pred,np.log1p(y_test)))",No,5,49.0 "X_train_2=X_train.copy() X_train_2['d_confirmed']=y_train #0.4412899060661785 <- without , with - 0.4463 X_test_2=X_test.copy() X_test_2['d_confirmed']=y_pred",No,4,49.0 "reg_2.fit(X_train_2,np.log1p(y_train_2))",No,5,7.0 y_pred_2 = reg_2.predict(X_test_2),No,5,48.0 "np.sqrt(mean_squared_error(y_pred_2,np.log1p(y_test_2)))",No,5,49.0 "test=pd.read_csv(""/kaggle/input/covid19-global-forecasting-week-2/test.csv"") test.rename(columns={'ForecastId':'Id'},inplace=True) test=add_location(test) test.set_index('location',inplace=True) test['day_of_year']=test['Date'].dt.dayofyear test['day_of_week']=test['Date'].dt.dayofweek test=add_days_passed(test,first_day) test=add_country_stat(test,country_stat)'",Yes,2,45.0 "big_train=big_train.drop(columns=[""index""]) ",No,5,10.0 "known=big_train['Date'].unique() print(known)",No,5,57.0 "if Make_submission==True: results=[] for d in days_to_predict: print(""Predicting {}"".format(d)) if d in known: print(""Data Known"") X=big_train.drop(columns=['Province_State','Country_Region','ConfirmedCases','Fatalities','location','Date', 'day_of_year','day_of_week','d_ConfirmedCases','d_Fatalities']) y=big_train['d_ConfirmedCases'] y_2=big_train['d_Fatalities'] mask_train=big_train['Date']> Prediction of confirmed cases # This script trains the model on the latest dataset and predicts the next value # Author: Neilay Khasnabish # Import libraries import numpy as np import pandas as pd from sklearn.model_selection import train_test_split from sklearn.tree import DecisionTreeRegressor from sklearn.ensemble import AdaBoostRegressor import matplotlib.pyplot as plt from sklearn.model_selection import RandomizedSearchCV import tqdm as tqdm # Making Kaggle dataset ready def kaggle(dfTrain, dfTest): pd.set_option('display.max_columns', None) dfTest['DateNew'] = pd.to_datetime(dfTest['Date']) dfTest = dfTest.drop(['Date'], axis=1) dfTest = dfTest.rename(columns={""DateNew"": ""Date""}) dfTest['Year'] = dfTest['Date'].dt.year dfTest['Month'] = dfTest['Date'].dt.month dfTest['Day'] = dfTest['Date'].dt.day dfTest = dfTest.drop(['Date'], axis=1) dfTest = dfTest.fillna('DummyProvince') #dfTest.to_csv('G:/Kaggle_COVID/covid19-global-forecasting-week-2/dummyTest.csv') dfTrain['DateNew'] = pd.to_datetime(dfTrain['Date']) dfTrain = dfTrain.drop(['Date'], axis=1) dfTrain = dfTrain.rename(columns={""DateNew"": ""Date""}) dfTrain['Year'] = dfTrain['Date'].dt.year dfTrain['Month'] = dfTrain['Date'].dt.month dfTrain['Day'] = dfTrain['Date'].dt.day dfTrain = dfTrain.drop(['Date'], axis=1) dfTrain = dfTrain.fillna('DummyProvince') #dfTrain.to_csv('G:/Kaggle_COVID/covid19-global-forecasting-week-2/dummyTrain.csv') result = pd.merge(dfTest, dfTrain, how='left', on=['Country_Region', 'Province_State', 'Year', 'Month', 'Day']) result = result.fillna(-1) # Clutter removal [rr, cc] = np.shape(result) for iQuit in range(rr): if result.loc[iQuit, 'Day'] == 4 : result.loc[iQuit, 'ConfirmedCases'] = -1 result.loc[iQuit, 'Fatalities'] = -1 #result.to_csv('G:/Kaggle_COVID/covid19-global-forecasting-week-2/temp.csv') return result # Finding RMSE def ErrorCalc(mdl, ref, tag): relError = np.abs(mdl - ref)/ np.abs(ref+1) MeanErrorV = np.mean(relError) print(tag + ': Mean Rel Error in %: ', MeanErrorV * 100) return MeanErrorV # Since cumulative prediction >> This script is not used for Kaggle dataset def 
AdjustingErrorsOutliers(tempPred, df) : tempPred = np.round(tempPred) tempPrev = df['day5'].to_numpy() # Next cumulative prediction must be more than or equal to previous for i in range(len(tempPred)): if tempPred[i] < tempPrev[i] : # Since cumulative prediction tempPred[i] = tempPrev[i] return tempPred # Train model def TrainMdl (trainIpData, trainOpData) : testSize = 0.1 # 90:10 ratio >> for final testing print('Training starts ...') randomState=None # randomState = 42 # For train test split # Final validation X_train, X_test, y_train, y_test = train_test_split(trainIpData, trainOpData, test_size=testSize, random_state=randomState) # Another set of input TrainIP = X_train[['day1', 'day2', 'day3', 'day4', 'day5', 'diff1', 'diff2', 'diff3', 'diff4']] TrainOP = X_train['gammaFun'] TestIP = X_test[['day1', 'day2', 'day3', 'day4', 'day5', 'diff1', 'diff2', 'diff3', 'diff4']] TestOP = X_test['gammaFun'] # Adaboost Regressor >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> treeDepth = 10 # Fixed mdl = DecisionTreeRegressor(max_depth=treeDepth) # This is fixed param_grid = { 'n_estimators': [100, 250, 500], 'learning_rate': [0.1, 0.01, 0.001] } regrMdl = AdaBoostRegressor(base_estimator=mdl) clf = RandomizedSearchCV(estimator = regrMdl, param_distributions = param_grid, n_iter = 100, cv = 3, verbose=0, random_state=42, n_jobs = -1) clf.fit(TrainIP, TrainOP) # Calculating Error y_predictedTrain = clf.predict(TrainIP) # Predicting the gamma function y_predictedTrain = AdjustingErrorsOutliers(y_predictedTrain * TrainIP['day5'].to_numpy(), TrainIP) ErrorCalc(y_predictedTrain, y_train.to_numpy(), 'Train Data-set') # y_predictedTrain converted to numbers y_predictedTest = clf.predict(TestIP) # Predicting the gamma function y_predictedTest = AdjustingErrorsOutliers(y_predictedTest * TestIP['day5'].to_numpy(), TestIP) ErrorCalc(y_predictedTest, y_test.to_numpy(), 'Validation Data-set ') # y_predictedTest converted to numbers print('-----------------------------------------------------------') # Read Kaggle dataset dfTrain = pd.read_csv(""../input/covid19-global-forecasting-week-2/train.csv"") dfTest = pd.read_csv(""../input/covid19-global-forecasting-week-2/test.csv"") df = kaggle(dfTrain, dfTest) print('Prediction starts ...') [rr, cc] = np.shape(df) for iP in range(rr): if df.loc[iP, 'ConfirmedCases'] == -1 : # iP-th position need to be predicted # Create a dataframe day5 = df.loc[iP-1, 'ConfirmedCases'] day4 = df.loc[iP-2, 'ConfirmedCases'] day3 = df.loc[iP-3, 'ConfirmedCases'] day2 = df.loc[iP-4, 'ConfirmedCases'] day1 = df.loc[iP-5, 'ConfirmedCases'] diff1 = day5 - day4 diff2 = day4 - day3 diff3 = day3 - day2 diff4 = day2 - day1 data = {'day1': [day1], 'day2': [day2], 'day3': [day3], 'day4': [day4], 'day5': [day5], 'diff1': [diff1], 'diff2': [diff2], 'diff3': [diff3], 'diff4': [diff4]} dfPredict = pd.DataFrame(data) finalPrediction = clf.predict(dfPredict[['day1', 'day2', 'day3', 'day4', 'day5', 'diff1', 'diff2', 'diff3', 'diff4']]) * day5 if finalPrediction < day5 : finalPrediction = day5 df.loc[iP, 'ConfirmedCases'] = np.round(finalPrediction) # Update the current location return df # Main code starts df = pd.read_csv(""../input/processedtimedata/TrainTest.csv"") # Processed dta from JHU trainIpData = df[['day1', 'day2', 'day3', 'day4', 'day5', 'gammaFun', 'diff1', 'diff2', 'diff3', 'diff4']] trainOpData = df['dayPredict'] # Predicted confirmed case predictions_dF = TrainMdl (trainIpData, trainOpData) # Kaggle data will be read inside print('Completed ...') #predictions_dF[['ForecastId', 
'ConfirmedCases', 'Fatalities']].to_csv('G:/Kaggle_COVID/covid19-global-forecasting-week-2/submission_ConfirmedCases.csv', index = False)'",Yes,1,7.0 "# KAGGLE competition >> Fatality rate # This script trains the model on the latest dataset and predicts the next value # Author: Neilay Khasnabish # Import libraries import numpy as np import pandas as pd from sklearn.model_selection import train_test_split from sklearn.tree import DecisionTreeRegressor from sklearn.ensemble import AdaBoostRegressor import matplotlib.pyplot as plt from sklearn.model_selection import RandomizedSearchCV # Making Kaggle dataset ready def kaggle(dfTrain, dfTest): pd.set_option('display.max_columns', None) dfTest['DateNew'] = pd.to_datetime(dfTest['Date']) dfTest = dfTest.drop(['Date'], axis=1) dfTest = dfTest.rename(columns={""DateNew"": ""Date""}) dfTest['Year'] = dfTest['Date'].dt.year dfTest['Month'] = dfTest['Date'].dt.month dfTest['Day'] = dfTest['Date'].dt.day dfTest = dfTest.drop(['Date'], axis=1) dfTest = dfTest.fillna('DummyProvince') #dfTest.to_csv('G:/Kaggle_COVID/covid19-global-forecasting-week-2/dummyTest.csv') dfTrain['DateNew'] = pd.to_datetime(dfTrain['Date']) dfTrain = dfTrain.drop(['Date'], axis=1) dfTrain = dfTrain.rename(columns={""DateNew"": ""Date""}) dfTrain['Year'] = dfTrain['Date'].dt.year dfTrain['Month'] = dfTrain['Date'].dt.month dfTrain['Day'] = dfTrain['Date'].dt.day dfTrain = dfTrain.drop(['Date'], axis=1) dfTrain = dfTrain.fillna('DummyProvince') #dfTrain.to_csv('G:/Kaggle_COVID/covid19-global-forecasting-week-2/dummyTrain.csv') result = pd.merge(dfTest, dfTrain, how='left', on=['Country_Region', 'Province_State', 'Year', 'Month', 'Day']) result = result.fillna(-1) # Clutter removal [rr, cc] = np.shape(result) for iQuit in range(rr): if result.loc[iQuit, 'Day'] == 4 : result.loc[iQuit, 'ConfirmedCases'] = -1 result.loc[iQuit, 'Fatalities'] = -1 #result.to_csv('G:/Kaggle_COVID/covid19-global-forecasting-week-2/temp.csv') return result # Finding RMSE def ErrorCalc(mdl, ref, tag): relError = np.abs(mdl - ref)/ np.abs(ref+1) MeanErrorV = np.mean(relError) print(tag + ': Mean Rel Error in %: ', MeanErrorV * 100) return MeanErrorV # Since cumulative prediction >> This script is not used for Kaggle dataset def AdjustingErrorsOutliers(tempPred, df) : tempPred = np.round(tempPred) tempPrev = df['day5'].to_numpy() # Next cumulative prediction must be more than or equal to previous for i in range(len(tempPred)): if tempPred[i] < tempPrev[i] : # Since cumulative prediction tempPred[i] = tempPrev[i] return tempPred # Train model def TrainMdl (trainIpData, trainOpData) : testSize = 0.1 # 90:10 ratio >> for final testing print('Training starts ...') randomState=None # randomState = 42 # For train test split # Final validation X_train, X_test, y_train, y_test = train_test_split(trainIpData, trainOpData, test_size=testSize, random_state=randomState) # Another set of input TrainIP = X_train[['day1', 'day2', 'day3', 'day4', 'day5', 'diff1', 'diff2', 'diff3', 'diff4']] TrainOP = X_train['gammaFun'] TestIP = X_test[['day1', 'day2', 'day3', 'day4', 'day5', 'diff1', 'diff2', 'diff3', 'diff4']] TestOP = X_test['gammaFun'] # Adaboost Regressor >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> treeDepth = 10 # Fixed mdl = DecisionTreeRegressor(max_depth=treeDepth) # This is fixed param_grid = { 'n_estimators': [100, 250, 500], 'learning_rate': [0.1, 0.01, 0.001] } regrMdl = AdaBoostRegressor(base_estimator=mdl) clf = RandomizedSearchCV(estimator = regrMdl, param_distributions = param_grid, n_iter = 
100, cv = 3, verbose=0, random_state=42, n_jobs = -1) clf.fit(TrainIP, TrainOP) # Calculating Error y_predictedTrain = clf.predict(TrainIP) # Predicting the gamma function y_predictedTrain = AdjustingErrorsOutliers(y_predictedTrain * TrainIP['day5'].to_numpy(), TrainIP) ErrorCalc(y_predictedTrain, y_train.to_numpy(), 'Train Data-set') # y_predictedTrain converted to numbers y_predictedTest = clf.predict(TestIP) # Predicting the gamma function y_predictedTest = AdjustingErrorsOutliers(y_predictedTest * TestIP['day5'].to_numpy(), TestIP) ErrorCalc(y_predictedTest, y_test.to_numpy(), 'Validation Data-set ') # y_predictedTest converted to numbers print('-----------------------------------------------------------') # Read Kaggle dataset dfTrain = pd.read_csv(""../input/covid19-global-forecasting-week-2/train.csv"") dfTest = pd.read_csv(""../input/covid19-global-forecasting-week-2/test.csv"") df = kaggle(dfTrain, dfTest) [rr, cc] = np.shape(df) for iP in range(rr): if df.loc[iP, 'Fatalities'] == -1 : # iP-th position need to be predicted # Create a dataframe day5 = df.loc[iP-1, 'Fatalities'] day4 = df.loc[iP-2, 'Fatalities'] day3 = df.loc[iP-3, 'Fatalities'] day2 = df.loc[iP-4, 'Fatalities'] day1 = df.loc[iP-5, 'Fatalities'] diff1 = day5 - day4 diff2 = day4 - day3 diff3 = day3 - day2 diff4 = day2 - day1 data = {'day1': [day1], 'day2': [day2], 'day3': [day3], 'day4': [day4], 'day5': [day5], 'diff1': [diff1], 'diff2': [diff2], 'diff3': [diff3], 'diff4': [diff4]} dfPredict = pd.DataFrame(data) finalPrediction = clf.predict(dfPredict[['day1', 'day2', 'day3', 'day4', 'day5', 'diff1', 'diff2', 'diff3', 'diff4']]) * day5 if finalPrediction < day5 : finalPrediction = day5 df.loc[iP, 'Fatalities'] = np.round(finalPrediction) # Update the current location return df # Main code starts df = pd.read_csv(""../input/processedtimedata/TrainTest_Fatality.csv"") # Processed dta from JHU trainIpData = df[['day1', 'day2', 'day3', 'day4', 'day5', 'gammaFun', 'diff1', 'diff2', 'diff3', 'diff4']] trainOpData = df['dayPredict'] # Predicted fatality fatality_dF = TrainMdl (trainIpData, trainOpData) # Kaggle data will be read inside print('Completed ...') #predictions_dF[['ForecastId', 'ConfirmedCases', 'Fatalities']].to_csv('G:/Kaggle_COVID/covid19-global-forecasting-week-2/submission_Fatality.csv', index = False)'",Yes,1,7.0 "# Creating the submission predictions_dF['Fatalities'] = fatality_dF['Fatalities'] predictions_dF[['ForecastId', 'ConfirmedCases', 'Fatalities']].to_csv('submission.csv', index=False) print(predictions_dF[['ForecastId', 'ConfirmedCases', 'Fatalities']].head(10)) print(predictions_dF[['ForecastId', 'ConfirmedCases', 'Fatalities']].tail(10))",No,4,25.0 "b""# import necessary modules\nimport numpy as np \nimport pandas as pd \nimport matplotlib.pyplot as plt\nimport os\nimport warnings\nfrom datetime import datetime\nfrom scipy import stats\nfrom scipy.stats import norm, skew, probplot \n\nwarnings.filterwarnings('ignore')\ndaybasecount = 4 #antal dage der summeres over til estimat for kurvefaktorer\nbaseday = 98 - float(daybasecount-1)/2. 
#89.5 # var 86 #den dag i ret hvor der regnes ud fra\nexponent = 1./float(daybasecount) #exponent der overstter daybasecount increase til daglig increase\nfatalityBaseDayShift = 10 #antal dage baslns der beregnes ddsfald fra\nmaxincrease = 140 # strste tilladte increase mlt i procent over 4 dage\nmaxDeadPrDay = 1500""",No,4,77.0 "#print(os.listdir(""../kaggle-Covid19/covid19-global-forecasting-week-2"")) dftrain = pd.read_csv('../input/covid19-global-forecasting-week-3/train.csv', parse_dates=['Date']).sort_values(by=['Country_Region', 'Date']) dftest = pd.read_csv('../input/covid19-global-forecasting-week-3/test.csv', parse_dates=['Date']).sort_values(by=['Country_Region', 'Date']) # CURVE SMOOTHING #Add averages #dftrain ['Cases_m'] = dftrain.groupby(['Country_Region', 'Province_State'])[['ConfirmedCases']].transform(lambda x: x.shift(1)) #dftrain ['Cases_p'] = dftrain.groupby(['Country_Region', 'Province_State'])[['ConfirmedCases']].transform(lambda x: x.shift(-1)) #dftrain ['Cases_ave'] = 0.5*(dftrain['ConfirmedCases']+0.5*(dftrain['Cases_p']+dftrain['Cases_m'])) #case_cols = ['ConfirmedCases','Cases_m','Cases_p','Cases_ave'] #dftrain ['Fatalities_m'] = dftrain.groupby(['Country_Region', 'Province_State'])[['Fatalities']].transform(lambda x: x.shift(1)) #dftrain ['Fatalities_p'] = dftrain.groupby(['Country_Region', 'Province_State'])[['Fatalities']].transform(lambda x: x.shift(-1)) #dftrain ['Fatalities_ave'] = 0.5*(dftrain['Fatalities']+0.5*(dftrain['Fatalities_p']+dftrain['Fatalities_m'])) #fata_cols = ['Fatalities','Fatalities_m','Fatalities_p','Fatalities_ave'] #date_max = dftrain[""Date""].max() #mask = df[""Date""]==date_max #dftrain.loc[mask,'Cases_ave'] = 0.75*dftrain.loc[mask,'ConfirmedCases']+0.25*dftrain.loc[mask,'Cases_m'] #dftrain.loc[mask,'Fatalities_ave'] = 0.75*dftrain.loc[mask,'Fatalities'] +0.25*dftrain.loc[mask,'Fatalities_m'] #dftrain.drop(['Cases_m', 'Cases_p', 'Fatalities_m','Fatalities_p'],axis=1,inplace=True) #dftrain.drop(['ConfirmedCases','Fatalities'],axis=1,inplace = True) #dftrain.rename(columns={'Cases_ave':'ConfirmedCases','Fatalities_ave':'Fatalities'},inplace=True) #dftrain['ConfirmedCases','Fatalities'].fillna(0, inplace=True) ppp_tabel = pd.read_csv('../input/country-ppp/Country_PPP.csv', sep='\\s+')#.sort_values(by=['Country']) ppp_tabel.drop('Id', 1,inplace=True) ppp_tabel = ppp_tabel.append({'Country' : 'Burma' , 'ppp' : 8000} , ignore_index=True) ppp_tabel = ppp_tabel.append({'Country' : 'MS_Zaandam' , 'ppp' : 40000} , ignore_index=True) ppp_tabel = ppp_tabel.append({'Country' : 'West_Bank_and_Gaza' , 'ppp' : 20000} , ignore_index=True) ppp_tabel[""Country""].replace( '_',' ', regex=True,inplace=True) # _ var indfrt for at f den til at lse ppp_tabel[""Country""].replace( 'United States','US', regex=True,inplace=True) # _ var indfrt for at f den til at lse ppp_tabel.rename(columns={'Country':'Country_Region'},inplace=True) ppp_tabel.sort_values('Country_Region',inplace=True)'",No,4,45.0 "dftrain['Dayofyear'] = dftrain['Date'].dt.dayofyear dftest['Dayofyear'] = dftest['Date'].dt.dayofyear dftest['Expo'] = dftest['Dayofyear']-baseday print(dftrain.tail(5)) dftest = dftest.merge(dftrain[['Country_Region','Province_State','Date','ConfirmedCases','Fatalities']] , on=['Country_Region','Province_State','Date'], how='left', indicator=True) ",No,4,32.0 "#dftrain = dftrain.loc[dftrain['Country_Region'] == 'Denmark'] dftrain['Province_State'].fillna(dftrain['Country_Region'], inplace=True) dftest ['Province_State_orig'] = dftest ['Province_State'] dftest 
['Province_State'].fillna(dftest['Country_Region'], inplace=True) dftrain.sort_values(by =['Country_Region', 'Province_State','Date'], inplace=True) dftrain[['NewCases','NewFatalities']] = dftrain.groupby(['Country_Region', 'Province_State'])[['ConfirmedCases','Fatalities']].transform(lambda x: x.diff()) dftrain['FatalityBasis'] = dftrain.groupby(['Country_Region', 'Province_State'])[['ConfirmedCases']].transform(lambda x: x.shift(fatalityBaseDayShift)) #smid alt andet end senete bort dftrain = dftrain.loc[dftrain['Dayofyear'] > 80] #find sidste dato med oplysninger to_sum = ['Country_Region','Province_State','ConfirmedCases','Fatalities'] lastinfo = dftrain.groupby(['Country_Region','Province_State']).tail(1)[to_sum] lastinfo.rename(columns={'ConfirmedCases':'ConfirmedCases_init','Fatalities':'Fatalities_init'},inplace=True) #find gennemsnit af sidste 4(=daybasecount) dage to_sum = ['ConfirmedCases','NewCases','FatalityBasis'] grouped = dftrain.groupby(['Country_Region','Province_State']).tail(daybasecount) grouped_gem = dftrain.groupby(['Country_Region','Province_State'])[to_sum].mean() grouped_gem.reset_index(inplace=True) grouped_gem.rename(columns={'ConfirmedCases':'ConfirmedCases_base','Fatalities':'Fatalities_base' ,'NewCases':'NewCases_base'},inplace=True) grouped_gem = grouped_gem.merge(lastinfo, on=['Country_Region','Province_State'], how='outer', indicator=True) to_sum = ['NewCases','NewFatalities','FatalityBasis'] grouped2 = grouped.groupby(['Country_Region'])[to_sum].sum() grouped2['FatalityPct'] = 100*grouped2['NewFatalities']/grouped2['FatalityBasis'] grouped2.rename(columns={'NewCases':'NewCases2','NewFatalities':'NewFatalities2' ,'FatalityBasis':'FatalityBasis2','FatalityPct':'FatalityPct2'},inplace=True) with_ppp = pd.merge(grouped2, ppp_tabel, on=['Country_Region'], how='outer', indicator=True) missing = with_ppp.loc[with_ppp['ppp'].isnull()] dftrain.head(60) ",Yes,2,60.0 "b""#find gennemsnit af forrige 4(=daybasecount) dage\ngrouped=dftrain.groupby(['Country_Region','Province_State']).tail(daybasecount*2)\ngrouped=grouped.groupby(['Country_Region','Province_State']).head(daybasecount)\ngrouped.drop(['FatalityBasis'],axis=1,inplace=True)\n\nto_sum = ['NewCases','NewFatalities']\ngrouped1 = grouped.groupby(['Country_Region'])[to_sum].sum()\n\ngrouped1.rename(columns={'NewCases':'NewCases1','NewFatalities':'NewFatalities1'}, inplace=True)\n\n# beregn grundlggende increase ud fra sidst og forrige 4(=daybasecount) dage\ngrouped = pd.merge(grouped1, grouped2, on=['Country_Region'])\ngrouped['CasesIncreasePct'] = 100*(grouped['NewCases2']/grouped['NewCases1']-1)\nmask = grouped['CasesIncreasePct'] > maxincrease\ngrouped.loc[mask,'CasesIncreasePct'] = maxincrease\nmask = grouped['CasesIncreasePct'] < 0\ngrouped.loc[mask,'CasesIncreasePct'] = 0\nmask = grouped['CasesIncreasePct'].isnull()\ngrouped.loc[mask,'CasesIncreasePct'] = 0\ngrouped['Factor'] = (grouped['CasesIncreasePct']/100+1)**exponent\n\ngrouped = pd.merge(grouped, ppp_tabel, on=['Country_Region'])\n#grouped['ppp'].isnull().sum()\n\n# afgrns Fatality procent ud fra hndestimerede kurver med ppp \ngrouped['ppp'] = grouped['ppp']/10000.\nif False:\n mask = (grouped['FatalityPct2'] > 9) & (grouped['ppp'] <= 1)\n grouped.loc[mask,'FatalityPct2'] = 5\n mask = (grouped['FatalityPct2'] < 5) & (grouped['ppp'] <= 1)\n grouped.loc[mask,'FatalityPct2'] = 5\n mask = (grouped['FatalityPct2'] > 6) & (grouped['ppp'] >= 7)\n grouped.loc[mask,'FatalityPct2'] = 6\n mask = (grouped['FatalityPct2'] < 1.5) & (grouped['ppp'] >= 7)\n 
grouped.loc[mask,'FatalityPct2'] = 1.5\n mask = (grouped['FatalityPct2'] > (9.5 - 0.43*grouped['ppp'])) & (grouped['ppp'] > 1) & (grouped['ppp'] < 7)\n grouped.loc[mask,'FatalityPct2'] = (9.5 - 0.43*grouped['ppp'])\n mask = (grouped['FatalityPct2'] < (5.6 - 0.5*grouped['ppp'])) & (grouped['ppp'] > 1) & (grouped['ppp'] < 7)\n grouped.loc[mask,'FatalityPct2'] = (5.6 - 0.5*grouped['ppp'])\n mask = (grouped['FatalityPct2'].isnull()) & (grouped['ppp'] <= 1)\n grouped.loc[mask,'FatalityPct2'] = 7\n mask = (grouped['FatalityPct2'].isnull()) & (grouped['ppp'] >= 7)\n grouped.loc[mask,'FatalityPct2'] = 4\n mask = (grouped['FatalityPct2'].isnull()) & (grouped['ppp'] > 1) & (grouped['ppp'] < 7)\n grouped.loc[mask,'FatalityPct2'] = (7.5 - 0.5*grouped['ppp'])\nelse:\n mask = (grouped['FatalityPct2'] > 4) & (grouped['ppp'] <= 1)\n grouped.loc[mask,'FatalityPct2'] = 3\n mask = (grouped['FatalityPct2'] < 1) & (grouped['ppp'] <= 1)\n grouped.loc[mask,'FatalityPct2'] = 2\n mask = (grouped['FatalityPct2'] > 1.5) & (grouped['ppp'] >= 7)\n grouped.loc[mask,'FatalityPct2'] = 1.5\n mask = (grouped['FatalityPct2'] < 0.5) & (grouped['ppp'] >= 7)\n grouped.loc[mask,'FatalityPct2'] = 0.5\n mask = (grouped['FatalityPct2'] > (4.5 - 0.43*grouped['ppp'])) & (grouped['ppp'] > 1) & (grouped['ppp'] < 7)\n grouped.loc[mask,'FatalityPct2'] = (4.5 - 0.43*grouped['ppp'])\n mask = (grouped['FatalityPct2'] < (1.1 - 0.1*grouped['ppp'])) & (grouped['ppp'] > 1) & (grouped['ppp'] < 7)\n grouped.loc[mask,'FatalityPct2'] = (1.1 - 0.1*grouped['ppp'])\n mask = (grouped['FatalityPct2'].isnull()) & (grouped['ppp'] <= 1)\n grouped.loc[mask,'FatalityPct2'] = 3\n mask = (grouped['FatalityPct2'].isnull()) & (grouped['ppp'] >= 7)\n grouped.loc[mask,'FatalityPct2'] = 1\n mask = (grouped['FatalityPct2'].isnull()) & (grouped['ppp'] > 1) & (grouped['ppp'] < 7)\n grouped.loc[mask,'FatalityPct2'] = (2.6 - 0.23*grouped['ppp'])""",Yes,2,14.0 "dftest.drop('_merge',axis=1,inplace= True) dftest = dftest.merge(grouped[['Country_Region','FatalityPct2','Factor']], on=['Country_Region'], how='left') dftest = dftest.merge(grouped_gem[['Province_State','Country_Region','ConfirmedCases_base','ConfirmedCases_init','NewCases_base','Fatalities_init','FatalityBasis']], on=['Province_State','Country_Region'], how='left') ",No,4,32.0 "dftest['ConfirmedCases_shift'] = dftest.groupby(['Country_Region', 'Province_State'])[['ConfirmedCases']].transform(lambda x: x.shift(1)) mask = dftest['ConfirmedCases'].isnull() # find new cases dftest.loc[mask,'NewCases'] = dftest.loc[mask,'NewCases_base']*(dftest.loc[mask,'Factor']**dftest.loc[mask,'Expo']) #dftest.loc[mask,'Confirmed'] = dftest.loc[mask,'FatalityBasis2']*(dftest.loc[mask,'Factor']**dftest.loc[mask,'Expo']) dftest['NewCases_cum'] = dftest.groupby(['Country_Region', 'Province_State'])[['NewCases']].cumsum() #transform(lambda x: x.shift(1)) dftest.loc[mask,'ConfirmedCases'] = dftest.loc[mask,'ConfirmedCases_init'] + dftest.loc[mask,'NewCases_cum'] #juster Fatality nr der er rigtig mange cases dvs. der testes meget mask3 = dftest['ConfirmedCases'] > 400000 dftest.loc[mask3,'FatalityPct2'] = dftest.loc[mask3,'FatalityPct2']*0.7 mask4 = dftest['ConfirmedCases'] > 800000 dftest.loc[mask4,'FatalityPct2'] = dftest.loc[mask4,'FatalityPct2']*0.7 dftest['FatalityBasis'] = dftest.groupby(['Country_Region', 'Province_State'])[ ['ConfirmedCases']].transform(lambda x: x.shift(10)) dftest.loc[mask,'NewFatalities'] = dftest.loc[mask,'FatalityBasis'] * dftest.loc[mask,'FatalityPct2']/100 # st max tal for antal dde pr. 
dag mask2 = dftest['NewFatalities'] > maxDeadPrDay dftest.loc[mask2,'NewFatalities'] = maxDeadPrDay #print(""MASK2"",mask2.sum()) dftest['NewFatalities_cum'] = dftest.groupby(['Country_Region', 'Province_State'])[['NewFatalities']].cumsum() #transform(lambda x: x.shift(1)) dftest.loc[mask,'Fatalities'] = dftest.loc[mask,'Fatalities_init'] + dftest.loc[mask,'NewFatalities_cum'] '",Yes,3,8.0 "# Forbered aflevering dftest.drop(['Dayofyear', 'Expo','FatalityPct2', 'Factor', 'ConfirmedCases_base', 'ConfirmedCases_init', 'NewCases_base', 'Fatalities_init', 'FatalityBasis', 'ConfirmedCases_shift', 'NewCases', 'NewCases_cum', 'NewFatalities','NewFatalities_cum'],axis=1,inplace=True) final = dftest.groupby(['Country_Region','Province_State']).tail(1) dftest.drop(['Province_State'],axis=1,inplace=True) dftest.rename(columns={'Province_State_orig':'Province_State'},inplace=True)",No,4,10.0 "mask = dftest[""ConfirmedCases""].isnull() print(mask.sum()) errors = dftest.loc[mask] print(errors) mask = dftest[""Fatalities""].isnull() print(mask.sum()) errors = dftest.loc[mask] print(errors) dftest.drop(['Province_State','Country_Region','Date'],axis=1,inplace=True) print(""dftest columns ="",dftest.columns) '",Yes,4,39.0 "#print(dftest[dftest['Country_Region']=='Burma']) dftest.ForecastId = dftest.ForecastId.astype('int') dftest['ConfirmedCases'] = dftest['ConfirmedCases'].round().astype(int) dftest['Fatalities'] = dftest['Fatalities'].round().astype(int) dftest.to_csv('submission.csv', index=False) ",Yes,4,25.0 "import numpy as np import pandas as pd import matplotlib.pyplot as plt import seaborn as sns from sklearn import preprocessing import time from datetime import datetime from scipy import integrate, optimize import warnings warnings.filterwarnings('ignore') # ML libraries import lightgbm as lgb import xgboost as xgb from xgboost import plot_importance, plot_tree from sklearn.model_selection import RandomizedSearchCV, GridSearchCV from sklearn import linear_model from sklearn.metrics import mean_squared_error from sklearn.linear_model import Ridge, RidgeCV",No,5,23.0 "submission_example = pd.read_csv(""../input/covid19-global-forecasting-week-2/submission.csv"") test = pd.read_csv(""../input/covid19-global-forecasting-week-2/test.csv"") train = pd.read_csv(""../input/covid19-global-forecasting-week-2/train.csv"") display(train.head(5)) display(train.describe()) print(""Number of Country_Region: "", train['Country_Region'].nunique()) print(""Dates go from day"", max(train['Date']), ""to day"", min(train['Date']), "", a total of"", train['Date'].nunique(), ""days"") print(""Countries with Province/State informed: "", train[train['Province_State'].isna()==False]['Country_Region'].unique())'",No,4,45.0 "# Merge train and test, exclude overlap dates_overlap = ['2020-03-19','2020-03-20','2020-03-21','2020-03-22','2020-03-23', '2020-03-24', '2020-03-25', '2020-03-26', '2020-03-27', '2020-03-28', '2020-03-29', '2020-03-30', '2020-03-31'] train2 = train.loc[~train['Date'].isin(dates_overlap)] all_data = pd.concat([train2, test], axis = 0, sort=False) # Double check that there are no informed ConfirmedCases and Fatalities after 2020-03-11 all_data.loc[all_data['Date'] >= '2020-03-19', 'ConfirmedCases'] = np.nan all_data.loc[all_data['Date'] >= '2020-03-19', 'Fatalities'] = np.nan all_data['Date'] = pd.to_datetime(all_data['Date']) # Create date columns le = preprocessing.LabelEncoder() all_data['Day_num'] = le.fit_transform(all_data.Date) all_data['Day'] = all_data['Date'].dt.day all_data['Month'] = 
all_data['Date'].dt.month all_data['Year'] = all_data['Date'].dt.year # Fill null values given that we merged train-test datasets all_data['Province_State'].fillna(""None"", inplace=True) all_data['ConfirmedCases'].fillna(0, inplace=True) all_data['Fatalities'].fillna(0, inplace=True) all_data['Id'].fillna(-1, inplace=True) all_data['ForecastId'].fillna(-1, inplace=True) display(all_data) display(all_data.loc[all_data['Date'] == '2020-03-19'])'",Yes,3,8.0 "missings_count = {col:all_data[col].isnull().sum() for col in all_data.columns} missings = pd.DataFrame.from_dict(missings_count, orient='index') print(missings.nlargest(30, 0))",No,5,39.0 "def calculate_trend(df, lag_list, column): for lag in lag_list: trend_column_lag = ""Trend_"" + column + ""_"" + str(lag) df[trend_column_lag] = (df[column]-df[column].shift(lag, fill_value=-999))/df[column].shift(lag, fill_value=0) return df def calculate_lag(df, lag_list, column): for lag in lag_list: column_lag = column + ""_"" + str(lag) df[column_lag] = df[column].shift(lag, fill_value=0) return df ts = time.time() all_data = calculate_lag(all_data, range(1,7), 'ConfirmedCases') all_data = calculate_lag(all_data, range(1,7), 'Fatalities') all_data = calculate_trend(all_data, range(1,7), 'ConfirmedCases') all_data = calculate_trend(all_data, range(1,7), 'Fatalities') all_data.replace([np.inf, -np.inf], 0, inplace=True) all_data.fillna(0, inplace=True) print(""Time spent: "", time.time()-ts)'",No,3,8.0 "all_data[all_data['Country_Region']=='Spain'].iloc[40:50][['Id', 'Province_State', 'Country_Region', 'Date', 'ConfirmedCases', 'Fatalities', 'ForecastId', 'Day_num', 'ConfirmedCases_1', 'ConfirmedCases_2', 'ConfirmedCases_3', 'Fatalities_1', 'Fatalities_2', 'Fatalities_3']]",No,5,14.0 "# Load countries data file world_population = pd.read_csv(""/kaggle/input/population-by-country-2020/population_by_country_2020.csv"") # Select desired columns and rename some of them world_population = world_population[['Country (or dependency)', 'Population (2020)', 'Density (P/Km)', 'Land Area (Km)', 'Med. 
Age', 'Urban Pop %']] world_population.columns = ['Country (or dependency)', 'Population (2020)', 'Density', 'Land Area', 'Med Age', 'Urban Pop'] # Replace United States by US world_population.loc[world_population['Country (or dependency)']=='United States', 'Country (or dependency)'] = 'US' # Remove the % character from Urban Pop values world_population['Urban Pop'] = world_population['Urban Pop'].str.rstrip('%') # Replace Urban Pop and Med Age ""N.A"" by their respective modes, then transform to int world_population.loc[world_population['Urban Pop']=='N.A.', 'Urban Pop'] = int(world_population.loc[world_population['Urban Pop']!='N.A.', 'Urban Pop'].mode()[0]) world_population['Urban Pop'] = world_population['Urban Pop'].astype('int16') world_population.loc[world_population['Med Age']=='N.A.', 'Med Age'] = int(world_population.loc[world_population['Med Age']!='N.A.', 'Med Age'].mode()[0]) world_population['Med Age'] = world_population['Med Age'].astype('int16') print(""Cleaned country details dataset"") display(world_population) # Now join the dataset to our previous DataFrame and clean missings (not match in left join)- label encode cities print(""Joined dataset"") all_data = all_data.merge(world_population, left_on='Country_Region', right_on='Country (or dependency)', how='left') all_data[['Population (2020)', 'Density', 'Land Area', 'Med Age', 'Urban Pop']] = all_data[['Population (2020)', 'Density', 'Land Area', 'Med Age', 'Urban Pop']].fillna(0) display(all_data) print(""Encoded dataset"") # Label encode countries and provinces. Save dictionary for exploration purposes all_data.drop('Country (or dependency)', inplace=True, axis=1) all_data['Country_Region'] = le.fit_transform(all_data['Country_Region']) number_c = all_data['Country_Region'] countries = le.inverse_transform(all_data['Country_Region']) country_dict = dict(zip(countries, number_c)) all_data['Province_State'] = le.fit_transform(all_data['Province_State']) number_p = all_data['Province_State'] province = le.inverse_transform(all_data['Province_State']) province_dict = dict(zip(province, number_p)) display(all_data)'",Yes,1,45.0 "# Filter selected features data = all_data.copy() features = ['Id', 'ForecastId', 'Country_Region', 'Province_State', 'ConfirmedCases', 'Fatalities', 'Day_num'] data = data[features] # Apply log transformation to all ConfirmedCases and Fatalities columns, except for trends data[['ConfirmedCases', 'Fatalities']] = data[['ConfirmedCases', 'Fatalities']].astype('float64') data[['ConfirmedCases', 'Fatalities']] = data[['ConfirmedCases', 'Fatalities']].apply(lambda x: np.log1p(x)) # Replace infinites data.replace([np.inf, -np.inf], 0, inplace=True) # Split data into train/test def split_data(data): # Train set x_train = data[data.ForecastId == -1].drop(['ConfirmedCases', 'Fatalities'], axis=1) y_train_1 = data[data.ForecastId == -1]['ConfirmedCases'] y_train_2 = data[data.ForecastId == -1]['Fatalities'] # Test set x_test = data[data.ForecastId != -1].drop(['ConfirmedCases', 'Fatalities'], axis=1) # Clean Id columns and keep ForecastId as index x_train.drop('Id', inplace=True, errors='ignore', axis=1) x_train.drop('ForecastId', inplace=True, errors='ignore', axis=1) x_test.drop('Id', inplace=True, errors='ignore', axis=1) x_test.drop('ForecastId', inplace=True, errors='ignore', axis=1) return x_train, y_train_1, y_train_2, x_test # Ridge replace of the Linear regression model def ridge_reg(X_train, Y_train, X_test): # Create Ridge regression object #regr = Ridge() # commit 2 #regr = RidgeCV(cv=5) # 
commit 4 #regr = Ridge(alpha=10) # commit 5 regr = Ridge(alpha=10) # now # Train the model using the training sets regr.fit(X_train, Y_train) # Make predictions using the testing set y_pred = regr.predict(X_test) return regr, y_pred # Submission function def get_submission(s, df, target1, target2): prediction_1 = df[target1] prediction_2 = df[target2] # Submit predictions prediction_1 = [int(item) for item in list(map(round, prediction_1))] prediction_2 = [int(item) for item in list(map(round, prediction_2))] submission = pd.DataFrame({ ""ForecastId"": df['ForecastId'].astype('int32'), ""ConfirmedCases"": prediction_1, ""Fatalities"": prediction_2 }) submission.to_csv(s + '.csv', index=False)'",Yes,2,7.0 "# Select train (real) data from March 1 to March 22nd dates_list = ['2020-03-01', '2020-03-02', '2020-03-03', '2020-03-04', '2020-03-05', '2020-03-06', '2020-03-07', '2020-03-08', '2020-03-09', '2020-03-10', '2020-03-11','2020-03-12','2020-03-13','2020-03-14','2020-03-15','2020-03-16','2020-03-17','2020-03-18', '2020-03-19','2020-03-20','2020-03-21','2020-03-22','2020-03-23', '2020-03-24', '2020-03-25', '2020-03-26', '2020-03-27', '2020-03-28', '2020-03-29', '2020-03-30', '2020-03-31']",No,4,14.0 all_data.loc[all_data['Country_Region']==country_dict['Spain']][40:65],No,5,14.0 "def plot_rreg_basic_country(data, country_name, dates_list, day_start, shift): data_country = data[data['Country_Region']==country_dict[country_name]] data_country = data_country.loc[data_country['Day_num']>=day_start] X_train, Y_train_1, Y_train_2, X_test = split_data(data_country) model, pred = ridge_reg(X_train, Y_train_1, X_test) # Create a df with both real cases and predictions (predictions starting on March 12th) X_train_check = X_train.copy() X_train_check['Target'] = Y_train_1 X_test_check = X_test.copy() X_test_check['Target'] = pred X_final_check = pd.concat([X_train_check, X_test_check]) # Select predictions from March 1st to March 25th predicted_data = X_final_check.loc[(X_final_check['Day_num'].isin(list(range(day_start, day_start+len(dates_list)))))].Target real_data = train.loc[(train['Country_Region']==country_name) & (train['Date'].isin(dates_list))]['ConfirmedCases'] dates_list_num = list(range(0,len(dates_list))) # Plot results fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15,6)) ax1.plot(dates_list_num, np.expm1(predicted_data)) ax1.plot(dates_list_num, real_data) ax1.axvline(17-shift, linewidth=2, ls = ':', color='grey', alpha=0.5) ax1.legend(['Predicted cases', 'Actual cases', 'Train-test split'], loc='upper left') ax1.set_xlabel(""Day count (from March "" + str(1+shift) + "" to March 25th)"") ax1.set_ylabel(""Confirmed Cases"") ax2.plot(dates_list_num, predicted_data) ax2.plot(dates_list_num, np.log1p(real_data)) ax2.axvline(17-shift, linewidth=2, ls = ':', color='grey', alpha=0.5) ax2.legend(['Predicted cases', 'Actual cases', 'Train-test split'], loc='upper left') ax2.set_xlabel(""Day count (from March "" + str(1+shift) + "" to March 30th)"") ax2.set_ylabel(""Log Confirmed Cases"") plt.suptitle((""ConfirmedCases predictions based on Log-Lineal Regression for ""+country_name)) # Filter Spain, run the Linear Regression workflow country_name = ""Spain"" march_day = 0 day_start = 39+march_day dates_list2 = dates_list[march_day:] plot_rreg_basic_country(data, country_name, dates_list2, day_start, march_day)'",Yes,3,56.0 "# Filter Spain, run the Linear Regression workflow country_name = ""Spain"" march_day = 15 day_start = 39+march_day dates_list2 = dates_list[march_day:] 
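# Editor's note: same workflow as the Spain fit above, but with march_day = 15
# the regression is trained only from Day_num 54 onward (Day_num 39 maps to
# March 1st, so this fit uses data from roughly March 16th on).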
plot_rreg_basic_country(data, country_name, dates_list2, day_start, march_day)",No,4,56.0 "# Filter Italy, run the Linear Regression workflow country_name = ""Italy"" march_day = 0 day_start = 39+march_day dates_list2 = dates_list[march_day:] plot_rreg_basic_country(data, country_name, dates_list2, day_start, march_day)",No,5,33.0 "# Filter Italy, run the Linear Regression workflow country_name = ""Italy"" march_day = 15 day_start = 39+march_day dates_list2 = dates_list[march_day:] plot_rreg_basic_country(data, country_name, dates_list2, day_start, march_day)",No,2,56.0 "# Filter Germany, run the Linear Regression workflow country_name = ""Germany"" march_day = 0 day_start = 39+march_day dates_list2 = dates_list[march_day:] plot_rreg_basic_country(data, country_name, dates_list2, day_start, march_day)",No,5,56.0 "# Filter Germany, run the Linear Regression workflow country_name = ""Germany"" march_day = 15 day_start = 39+march_day dates_list2 = dates_list[march_day:] plot_rreg_basic_country(data, country_name, dates_list2, day_start, march_day)",No,5,33.0 "# Filter Albania, run the Linear Regression workflow country_name = ""Albania"" march_day = 0 day_start = 39+march_day dates_list2 = dates_list[march_day:] plot_rreg_basic_country(data, country_name, dates_list2, day_start, march_day)",No,5,33.0 "# Filter Albania, run the Linear Regression workflow country_name = ""Albania"" march_day = 15 day_start = 39+march_day dates_list2 = dates_list[march_day:] plot_rreg_basic_country(data, country_name, dates_list2, day_start, march_day)",No,5,56.0 "# Filter Andorra, run the Linear Regression workflow country_name = ""Andorra"" shift = 0 day_start = 39+shift dates_list2 = dates_list[shift:] plot_rreg_basic_country(data, country_name, dates_list2, day_start, shift)",No,5,33.0 "# Filter Andorra, run the Linear Regression workflow country_name = ""Andorra"" shift = 7 day_start = 39+shift dates_list2 = dates_list[shift:] plot_rreg_basic_country(data, country_name, dates_list2, day_start, shift)",No,5,33.0 "ts = time.time() def ridge_reg_basic_all_countries(data, day_start): data2 = data.loc[data.Day_num >= day_start] # Set the dataframe where we will update the predictions data_pred = data[data.ForecastId != -1][['Country_Region', 'Province_State', 'Day_num', 'ForecastId']] data_pred = data_pred.loc[data_pred['Day_num']>=day_start] data_pred['Predicted_ConfirmedCases'] = [0]*len(data_pred) data_pred['Predicted_Fatalities'] = [0]*len(data_pred) print(""Currently running Logistic Regression for all countries"") # Main loop for countries for c in data2['Country_Region'].unique(): # List of provinces provinces_list = data2[data2['Country_Region']==c]['Province_State'].unique() # If the country has several Province/State informed if len(provinces_list)>1: for p in provinces_list: data_cp = data2[(data2['Country_Region']==c) & (data2['Province_State']==p)] X_train, Y_train_1, Y_train_2, X_test = split_data(data_cp) model_1, pred_1 = ridge_reg(X_train, Y_train_1, X_test) model_2, pred_2 = ridge_reg(X_train, Y_train_2, X_test) data_pred.loc[((data_pred['Country_Region']==c) & (data2['Province_State']==p)), 'Predicted_ConfirmedCases'] = pred_1 data_pred.loc[((data_pred['Country_Region']==c) & (data2['Province_State']==p)), 'Predicted_Fatalities'] = pred_2 # No Province/State informed else: data_c = data2[(data2['Country_Region']==c)] X_train, Y_train_1, Y_train_2, X_test = split_data(data_c) model_1, pred_1 = ridge_reg(X_train, Y_train_1, X_test) model_2, pred_2 = ridge_reg(X_train, Y_train_2, X_test) 
data_pred.loc[(data_pred['Country_Region']==c), 'Predicted_ConfirmedCases'] = pred_1 data_pred.loc[(data_pred['Country_Region']==c), 'Predicted_Fatalities'] = pred_2 # Apply exponential transf. and clean potential infinites due to final numerical precision data_pred[['Predicted_ConfirmedCases', 'Predicted_Fatalities']] = data_pred[['Predicted_ConfirmedCases', 'Predicted_Fatalities']].apply(lambda x: np.expm1(x)) data_pred.replace([np.inf, -np.inf], 0, inplace=True) return data_pred day_start = 52 data_pred = ridge_reg_basic_all_countries(data, day_start) get_submission('submission', data_pred, 'Predicted_ConfirmedCases', 'Predicted_Fatalities') print(""Process finished in "", round(time.time() - ts, 2), "" seconds"")'",Yes,3,48.0 "import numpy as np import pandas as pd import os sub = pd.read_csv(""../input/submissions/submission.csv"") sub.to_csv(""submission.csv"", index=False)",No,3,45.0 "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) Predictions = pd.read_csv('../input/covid-predictions/COVID_predictions.csv') Predictions.to_csv('submission.csv',index=False)",No,4,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) # Input data files are available in the read-only ""../input/"" directory # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory import os for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename)) # You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using ""Save & Run All"" # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session # You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using ""Save & Run All"" # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session sub = pd.read_csv(""/kaggle/input/inputsubscsv/subs.csv"") sub.to_csv(""./submission.csv"", index=False)'",No,4,88.0 "# Data Handling import pandas as pd import numpy as np import math import scipy.stats as sps #from scipy import stats, integrate from time import time # sklearn and models from sklearn import preprocessing, ensemble, metrics, feature_selection, model_selection, pipeline import xgboost as xgb #plotting and display from IPython.display import display from matplotlib import pyplot",No,5,22.0 "# create date parser dateparse = lambda x: pd.datetime.strptime(x, '%Y-%m-%d') # create data type converters dtype_map_weather = dict(Station = 'str') dtype_map_test_train = dict(Block = 'str', Street = 'str') # read data into PANDAS DataFrames with date parsing test = pd.read_csv('../input/test.csv', parse_dates=['Date'], date_parser=dateparse, dtype= dtype_map_test_train) train = pd.read_csv('../input/train.csv', parse_dates=['Date'], date_parser=dateparse, dtype= dtype_map_test_train) weather = pd.read_csv('../input/weather.csv', parse_dates=['Date'], date_parser=dateparse, dtype= dtype_map_weather) sample_sub = pd.read_csv('../input/sampleSubmission.csv')",No,4,45.0 "print('Train') display(train.info()) print('Test') 
display(test.info())",No,5,40.0 "print('Weather') display(weather.info())",No,5,40.0 "# weather weather_exclude = ['Dewpoint', 'WetBulb', 'CodeSum', 'Depth', 'Water1', 'SnowFall', 'StnPressure', 'SeaLevel', 'ResultSpeed', 'ResultDir', 'AvgSpeed','DewPoint'] weather_cols = [col for col in weather.columns if col not in weather_exclude] weather = weather[weather_cols] # train train_exclude = ['Address', 'AddressNumberAndStreet', 'AddressAccuracy', 'NumMosquitos'] train_cols = [col for col in train.columns if col not in train_exclude] train = train[train_cols] # test test_exclude = ['Address', 'AddressNumberAndStreet', 'AddressAccuracy', 'Id'] test_cols = [col for col in test.columns if col not in test_exclude] test = test[test_cols]",No,4,10.0 weather.info(),No,5,40.0 "print('Weather') display(weather.head()) print('Train') display(train.head())",No,5,41.0 "# what species have been detected (note that according to the CDC each # of these species can carry WNV) set(train.Species)",No,5,57.0 "# does this correspond to the test set set(test.Species) # it looks like there is another category",No,5,57.0 train.groupby('Species').sum().WnvPresent,No,5,60.0 "miss_weather = ['M', '-'] trace_weather = ['T']",No,4,10.0 cols_not_date = [col for col in weather.columns if col != 'Date'],No,5,14.0 "weather[cols_not_date].apply(pd.value_counts, axis=1)[miss_weather + trace_weather].fillna(0).sum()",No,5,17.0 "# Both stations check.loc[['M', '-', 'T']]/(len(weather)) * 100",No,4,41.0 "# Station 1 check_stat1.loc[['M', '-', 'T']]/(len(weather)) * 100",No,4,41.0 "# Station 2() check_stat2.loc[['M', '-', 'T']]/(len(weather)) * 100",No,4,41.0 "weather = weather.replace('M', np.NaN) weather = weather.replace('-', np.NaN) weather = weather.replace('T', 0.005) # very small amounts of rain can impact mosquito hatches weather.Tmax = weather.Tmax.fillna(method = 'ffill') weather.Tmin = weather.Tmin.fillna(method = 'ffill') weather.Depart = weather.Depart.fillna(method = 'ffill') weather.Heat = weather.Heat.fillna(method = 'ffill') weather.Cool = weather.Cool.fillna(method = 'ffill') weather.PrecipTotal = weather.PrecipTotal.fillna(method = 'ffill')",No,4,17.0 "# convert datatpypes to_numeric = ['Tmax','Tmin','Tavg', 'Depart', 'Heat', 'Cool', 'PrecipTotal'] for col in to_numeric: weather[col]= pd.to_numeric(weather[col])",No,5,16.0 "weather.Sunrise = weather.Sunrise.fillna(method = 'ffill') weather.Sunset = weather.Sunset.fillna(method = 'ffill')",No,5,17.0 "# sunset has entries where instead of incrementing to the next hour after xx59 it incremented to xx60 # This causes an exception, let's take a look counter = 0 tracker = [] for index, val in enumerate(weather.Sunset): try: pd.to_datetime(val, format = '%H%M').time() except: counter += 1 tracker.append((index, val, val[2:], counter)) print(tracker[-1]) # there are 48 exceptions",No,4,16.0 "# time conversion lambda function time_func = lambda x: pd.Timestamp(pd.to_datetime(x, format = '%H%M'))",No,5,16.0 weather.Sunrise = weather.Sunrise.apply(time_func),No,5,8.0 weather.Sunset = weather.Sunset.apply(time_func),No,5,8.0 "# what is the range of values for sunrise and sunset (in hours) minutes= (weather.Sunset - weather.Sunrise).astype('timedelta64[m]')",No,5,16.0 "#create a DayLength column with minute level precsion weather['DayLength_MPrec'] = (weather.Sunset - weather.Sunrise).astype('timedelta64[m]')/60",No,5,16.0 "#create a DayLength column with rounded to the nearest hour weather['DayLength_NearH'] = np.round(((weather.Sunset - 
weather.Sunrise).astype('timedelta64[m]')/60).values)",No,5,8.0 "# length of night with minute level precision weather['NightLength_MPrec']= 24.0 - weather.DayLength_MPrec",No,5,8.0 "# lenght of night rounded to nearest hour weather['NightLength_NearH']= 24.0 - weather.DayLength_NearH",No,5,8.0 "# function to calculate sunset and sunrise times in hours hours_RiseSet_func = lambda x: x.minute/60.0 + float(x.hour)",No,5,84.0 "# sunrise in hours weather['Sunrise_hours'] = weather.Sunrise.apply(hours_RiseSet_func)",No,5,8.0 "# sunset in hours weather['Sunset_hours'] = weather.Sunset.apply(hours_RiseSet_func)",No,5,8.0 "station_blend = pd.DataFrame((station_1.values + station_2.values)/2, columns= blended_cols)",No,5,12.0 "extract_2 = weather[weather.Station == '2'].reset_index(drop = True) extract_2.head()",No,4,14.0 "extract_1 = weather[weather.Station == '1'].reset_index(drop = True) extract_1.head()",No,5,14.0 "joined_1 = extract_1.join(station_blend) joined_2 = extract_2.join(station_blend)",No,5,32.0 "weather_blend = pd.concat([joined_1, joined_2])",No,5,11.0 weather_blend.info(),No,5,40.0 "month_func = lambda x: x.month day_func= lambda x: x.day day_of_year_func = lambda x: x.dayofyear week_of_year_func = lambda x: x.week # train train['month'] = train.Date.apply(month_func) train['day'] = train.Date.apply(day_func) train['day_of_year'] = train.Date.apply(day_of_year_func) train['week'] = train.Date.apply(week_of_year_func) # test test['month'] = test.Date.apply(month_func) test['day'] = test.Date.apply(day_func) test['day_of_year'] = test.Date.apply(day_of_year_func) test['week'] = test.Date.apply(week_of_year_func)",No,4,8.0 "# remove sunrise and sunset since we have extracted critical information into other fields weather_blend = weather_blend.drop(['Sunrise', 'Sunset'], axis= 1)",No,5,10.0 "train = train.merge(weather_blend, on='Date') test = test.merge(weather_blend, on='Date')",No,5,32.0 "weather_blend.ix[:,:12].describe()",No,4,40.0 "weather_blend.ix[:,12:].describe()",No,4,40.0 "# split the data into two dataframes by station train_station_1= train[train.Station == '1'] train_station_2= train[train.Station == '2'] test_station_1= test[test.Station == '1'] test_station_2= test[test.Station == '2']",No,5,14.0 "# set up a merge for stations 1 and 2 # keep unique cols from station 2 keep_cols = ['Date', u'Tmax', u'Tmin', u'Tavg',u'PrecipTotal'] train_station_2 = train_station_2[keep_cols] test_station_2 = test_station_2[keep_cols] # rename cols with prefix prefix_s2 = 'stat_2_' rename_cols_s2 = [prefix_s2 + col for col in train_station_2.columns] train_station_2.columns = rename_cols_s2 test_station_2.columns = rename_cols_s2",No,3,61.0 "# drop cols from station 1 that won't be used in model drop_cols = ['Heat', 'Cool', 'Depart', 'NightLength_MPrec', 'NightLength_NearH', 'blended_Depart', 'blended_Heat', 'blended_Cool'] train_station_1 = train_station_1.drop(drop_cols, axis= 1) test_station_1 = test_station_1.drop(drop_cols, axis= 1) ",No,5,10.0 "# raname uniqe station 1 columns prefix_s1 = 'stat_1_' rename_cols_s1 = [prefix_s1 + col for col in keep_cols] cols_to_rename= [col for col in train_station_1.columns if col in keep_cols] # setup name mapping s1_name_map = dict(zip(cols_to_rename, rename_cols_s1)) train_station_1 = train_station_1.rename(columns= s1_name_map) test_station_1 = test_station_1.rename(columns= s1_name_map)",No,4,61.0 "# concat (outer join) train_station_1 = train_station_1.reset_index(drop= True) train_station_2 = train_station_2.reset_index(drop = True) 
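# [Editor's note] The .ix indexer used a few cells above (weather_blend.ix[:, :12]) has been
# removed from modern pandas; the positional equivalent is .iloc, e.g.
# weather_blend.iloc[:, :12].describe() and weather_blend.iloc[:, 12:].describe().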
train_merge = pd.concat([train_station_1, train_station_2], axis= 1) test_station_1 = test_station_1.reset_index(drop= True) test_station_2 = test_station_2.reset_index(drop = True) test_merge = pd.concat([test_station_1, test_station_2], axis= 1)",No,4,11.0 "# remove dates train_merge = train_merge.drop(['stat_1_Date', 'stat_2_Date'], axis = 1) test_merge = test_merge.drop(['stat_1_Date', 'stat_2_Date' ], axis = 1)",No,5,10.0 "# add lat and long integer columns train_merge['Lat_int'] = train_merge.Latitude.astype(int) train_merge['Long_int'] = train_merge.Longitude.astype(int) test_merge['Lat_int'] = test_merge.Latitude.astype(int) test_merge['Long_int'] = test_merge.Longitude.astype(int)",No,5,16.0 "#train_merge= train_merge.drop(['Street', 'Trap', 'Station'], axis= 1) #test_merge= test_merge.drop(['Street', 'Trap', 'Station'], axis= 1) train_merge= train_merge.drop('Station', axis= 1) test_merge= test_merge.drop('Station', axis= 1)",No,5,10.0 len(train_merge.columns),No,5,40.0 len(test_merge.columns),No,5,40.0 "test_merge= test_merge.drop(unique_test_cols, axis= 1)",No,5,10.0 "clf = ensemble.RandomForestClassifier(n_estimators=1000, min_samples_split= 2, random_state= 42) clf.fit(train_merge, labels)",No,5,7.0 "# create predictions and submission file predictions_randfor = clf.predict_proba(test_merge)[:,1]",No,5,48.0 "# fit model no training data xgbc = xgb.XGBClassifier(seed= 42) xgbc.fit(train_merge, labels) # feature importance #print(xgb.feature_importances_) # plot feature importance fig, ax = pyplot.subplots(figsize=(10, 15)) xgb.plot_importance(xgbc, ax=ax) #pyplot.show()",No,4,7.0 xgbc.get_fscore(),No,5,79.0 "# feature importance xgbc.get_fscore() #print(xgbc.feature_importances_)",No,5,84.0 "def calc_roc_auc(y, predict_probs): """""" Function accepts labels (matrix y) and predicted probabilities Function calculates fpr (false positive rate), tpr (true postivies rate), thresholds and auc (area under the roc curve) Function returns auc """""" fpr, tpr, thresholds = metrics.roc_curve(y, predict_probs) roc_auc = metrics.auc(fpr, tpr) return roc_auc",No,5,84.0 "train_split, val_split, label_train_split, label_val_split = model_selection.train_test_split(train_merge, labels, test_size = 0.33, random_state = 42, stratify= labels)",No,5,13.0 train_merge.shape,No,5,58.0 "# initialize and fit model xgb_clf= xgb.XGBClassifier(seed= 42) xgb_clf.fit(sfm_train, labels)",No,4,7.0 "sfm_test = sfm.transform(test_merge) predictions_xgb = xgb_clf.predict_proba(sfm_test)[:,1]",No,4,48.0 "X_train= train_split X_test= val_split y_train= label_train_split y_test= label_val_split model= xgb.XGBClassifier(seed= 42) eval_set = [(X_train, y_train), (X_test, y_test)] model.fit(X_train, y_train, eval_metric=""auc"", eval_set=eval_set, verbose=True)",Yes,3,7.0 "results = model.evals_result() print(results)",No,5,28.0 "model.fit(X_train, y_train, eval_metric=[""auc"", ""logloss"", ""error""], eval_set=eval_set) # retrieve performance metrics results = model.evals_result() epochs = len(results['validation_0']['auc']) x_axis = range(0, epochs) # plot auc fig, ax = pyplot.subplots() ax.plot(x_axis, results['validation_0']['auc'], label='Train') ax.plot(x_axis, results['validation_1']['auc'], label='Test') ax.legend() pyplot.ylabel('AUC') pyplot.title('XGBoost AUC by Epoch') pyplot.show() # plot logloss fig, ax = pyplot.subplots() ax.plot(x_axis, results['validation_0']['logloss'], label='Train') ax.plot(x_axis, results['validation_1']['logloss'], label='Test') ax.legend() pyplot.ylabel('Logloss') 
pyplot.title('XGBoost Logloss by Epoch') pyplot.show() # plot error fig, ax = pyplot.subplots() ax.plot(x_axis, results['validation_0']['error'], label='Train') ax.plot(x_axis, results['validation_1']['error'], label='Test') ax.legend() pyplot.ylabel('Error') pyplot.title('XGBoost Error by Epoch') pyplot.show()'",No,4,35.0 "eval_set = [(X_test, y_test)] model.fit(X_train, y_train, eval_metric=[""auc""], eval_set=eval_set, early_stopping_rounds=10) results = model.evals_result() print(results)",Yes,3,7.0 "# Utility function to report best scores def report(results, n_top=3): for i in range(1, n_top + 1): candidates = np.flatnonzero(results['rank_test_score'] == i) for candidate in candidates: print(""Model with rank: {0}"".format(i)) print(""Mean validation score: {0:.3f} (std: {1:.3f})"".format( results['mean_test_score'][candidate], results['std_test_score'][candidate])) print(""Parameters: {0}"".format(results['params'][candidate])) print("""")'",No,4,1.0 "#n_estimators_dist= np.random.randint(1, 500)# number of trees, could use a discrete list or np.random.exponential(scale=0.1, size= 100) #colsample_bytree_dist= np.random.uniform(0.2,0.6) # should be 0.3 - 0.5 #max_depth_dist = np.random.randint(2, 12) # typical values 3 - 10 #learning_rate_dist= np.random.uniform(0.01, 0.3) # default 0.3, typical values 0.01 - 0.2 #learning_rate_dist= scipy.stats.expon(scale=100) #learning_rate_dist= 10. ** np.arange(-3, -2) n_estimators_dist= sps.randint(1, 300) learning_rate_dist = [0.01, 0.02, 0.05, 0.1, 0.2, 0.3]",No,5,5.0 "#cv = model_selection.StratifiedShuffleSplit(n_splits = 10, random_state = 42) param_dist = dict(learning_rate= learning_rate_dist, n_estimators= n_estimators_dist) # run randomized search n_iter_search = 20 random_search = model_selection.RandomizedSearchCV(model, param_distributions=param_dist, n_iter=n_iter_search, scoring= 'roc_auc') start = time() random_search.fit(X_train, y_train) print(""RandomizedSearchCV took %.2f seconds for %d candidates"" "" parameter settings."" % ((time() - start), n_iter_search)) report(random_search.cv_results_)'",Yes,3,6.0 "sample_sub['WnvPresent'] = predictions_xgb sample_sub.to_csv('sub_xgb.csv', index=False) #sample_sub['WnvPresent'] = predictions_randfor #sample_sub.to_csv('sub_randfor.csv', index=False)",No,4,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load in import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) from matplotlib import pyplot as plt plt.style.use('fivethirtyeight') # Input data files are available in the ""../input/"" directory. 
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory from sklearn.metrics import f1_score, log_loss, precision_score, confusion_matrix, classification_report import os print(os.listdir(""../input"")) # Any results you write to the current directory are saved as output.",No,4,88.0 "test_stage_1 = pd.read_csv(""../input/test_stage_1.tsv"", sep=""\\t"")'",No,5,45.0 test_stage_1[0:5],No,5,41.0 "# assigning the GAP dev data as test data test_df = pd.read_csv(""https://raw.githubusercontent.com/google-research-datasets/gap-coreference/master/gap-development.tsv"", delimiter='\\t') # assigning the GAP test data as train data train_df = pd.read_csv(""https://raw.githubusercontent.com/google-research-datasets/gap-coreference/master/gap-test.tsv"", delimiter='\\t') valid_df = pd.read_csv(""https://raw.githubusercontent.com/google-research-datasets/gap-coreference/master/gap-validation.tsv"", delimiter='\\t')'",No,5,45.0 "# using the full set of training and validation data train_df = pd.concat([train_df,valid_df])",No,5,11.0 "def scrape_url(url): ''' get the title of the wikipedia page and replace ""_"" with white space ''' return url[29:].lower().replace(""_"","" "") def check_name_in_string(name,string): ''' check whether the name string is a substring of another string (i.e. wikipedia title) ''' return name.lower() in string def predict_coref(df): pred =[] for index, row in df.iterrows(): wiki_title = scrape_url(row[""URL""]) if (check_name_in_string(row[""A""],wiki_title)): pred.append(""A"") else: if (check_name_in_string(row[""B""],wiki_title)): pred.append(""B"") else: pred.append(""NEITHER"") return pred train_pred = predict_coref(train_df) test_pred = predict_coref(test_df)'",No,5,53.0 "gold_train = [] for index, row in train_df.iterrows(): if (row[""A-coref""]): gold_train.append(""A"") else: if (row[""B-coref""]): gold_train.append(""B"") else: gold_train.append(""NEITHER"") gold_test = [] for index, row in test_df.iterrows(): if (row[""A-coref""]): gold_test.append(""A"") else: if (row[""B-coref""]): gold_test.append(""B"") else: gold_test.append(""NEITHER"") ",No,3,12.0 " print(f1_score( gold_train, train_pred, average='micro')) print(classification_report( gold_train, train_pred)) print(confusion_matrix(gold_train, train_pred))",No,5,28.0 "def prec_prob(gold, pred, test): ''' Using the training set to determine the precision by class and assigning it to the test data set ''' scores = [] precision = precision_score(gold, pred, average=None, labels=['A','B','NEITHER']) A_prec = precision[0] B_prec = precision[1] Neither_prec = precision[2] for ante in test: if (ante == 'A'): scores.append([A_prec, B_prec*B_prior, Neither_prec*Neither_prior]) else: if (ante =='B'): scores.append([A_prec*A_prior, B_prec, Neither_prec*Neither_prior]) else: scores.append([A_prec*A_prior,B_prec*B_prior,Neither_prec]) return scores",No,3,49.0 " scores_train = prec_prob(gold_train, train_pred, train_pred) log_loss(gold_train,scores_train)",No,4,49.0 " scores_test = prec_prob(gold_train, train_pred, test_pred) log_loss(gold_test,scores_test)",No,4,49.0 "sample_submission = pd.read_csv(""../input/sample_submission_stage_1.csv"")",No,5,45.0 "sample_submission[['A','B','NEITHER']] = scores_test",No,5,55.0 "sample_submission.to_csv(""submission.csv"", index=False)",No,5,25.0 "#Gender classification using universal data set #import libaries import numpy as np import pandas as pd #ML Packages from sklearn.feature_extraction.text import CountVectorizer from 
sklearn.feature_extraction.text import TfidfVectorizer from sklearn.feature_extraction import DictVectorizer #load teh data set data_set = pd.read_csv(""../input/names-dataset/names_dataset.csv"") xfeatures = data_set[""name""] #feature extraction cv = CountVectorizer() X = cv.fit_transform(xfeatures) data_set.sex.replace({'F':0,'M':1},inplace=True) #features X = X #label data_set.drop_duplicates(keep=""first"", inplace=True) y =data_set.sex from collections import Counter print(""ty"",Counter(y)) from sklearn.model_selection import train_test_split X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42) # Naive Bayes Classifier from sklearn.naive_bayes import MultinomialNB clf = MultinomialNB() clf.fit(X, y) def Predict(data): test_name = [data] vector = cv.transform(test_name).toarray() result = clf.predict(vector)[0] return result '",Yes,3,7.0 "train_set = pd.read_csv(""../input/gendered-pronoun-resolution/test_stage_1.tsv"", encoding=""utf-8"", error_bad_lines=False, delimiter='\\t') train_set[""A""] = train_set[""A""].apply(Predict) train_set[""B""] = train_set[""B""].apply(Predict) '",No,3,45.0 "train_set = train_set[[""ID"", ""A"", 'B'] ]'",No,5,10.0 " train_set[""NEITHER""] = abs(train_set[""NEITHER""].astype(int))",No,5,16.0 "train_set.to_csv('sub.csv', index=False)",No,5,25.0 "b""import numpy as np\nimport pandas as pd\nimport spacy\nfrom spacy import displacy\nnlp = spacy.load('en_core_web_sm')\nimport nltk\nfrom sklearn import *\n\ntest = pd.read_csv('../input/test_stage_2.tsv', delimiter='\\t').rename(columns={'A': 'A_Noun', 'B': 'B_Noun'})\nsub = pd.read_csv('../input/sample_submission_stage_2.csv')\ntest.shape, sub.shape""",No,3,45.0 "gh_test = pd.read_csv(""https://raw.githubusercontent.com/google-research-datasets/gap-coreference/master/gap-test.tsv"", delimiter='\\t') #Adding gh_train for stage two submission with new test data, will also add any new data available via Kaggle Competition data for stage2 :) gh_train = pd.read_csv(""https://raw.githubusercontent.com/google-research-datasets/gap-coreference/master/gap-development.tsv"", delimiter='\\t') gh_valid = pd.read_csv(""https://raw.githubusercontent.com/google-research-datasets/gap-coreference/master/gap-validation.tsv"", delimiter='\\t') train = pd.concat((gh_test, gh_train, gh_valid)).rename(columns={'A': 'A_Noun', 'B': 'B_Noun'}).reset_index(drop=True) train.shape'",No,4,45.0 "def name_replace(s, r1, r2): s = str(s).replace(r1,r2) for r3 in r1.split(' '): s = str(s).replace(r3,r2) return s def get_features(df): df['section_min'] = df[['Pronoun-offset', 'A-offset', 'B-offset']].min(axis=1) df['Pronoun-offset2'] = df['Pronoun-offset'] + df['Pronoun'].map(len) df['A-offset2'] = df['A-offset'] + df['A_Noun'].map(len) df['B-offset2'] = df['B-offset'] + df['B_Noun'].map(len) df['A-dist_abs'] = (df['Pronoun-offset'] - df['A-offset']).abs() df['B-dist_abs'] = (df['Pronoun-offset'] - df['B-offset']).abs() df['A-dist'] = (df['Pronoun-offset'] - df['A-offset']) df['B-dist'] = (df['Pronoun-offset'] - df['B-offset']) df['section_max'] = df[['Pronoun-offset2', 'A-offset2', 'B-offset2']].max(axis=1) df['A_max'] = (df['A-offset2'] == df['section_max']).astype(int) df['A_min'] = (df['A-offset2'] == df['section_min']).astype(int) df['B_max'] = (df['B-offset2'] == df['section_max']).astype(int) df['B_min'] = (df['B-offset2'] == df['section_min']).astype(int) df['wc'] = df.apply(lambda r: len(str(r['Text'][r['section_min']: r['section_max']]).split(' ')), axis=1) #df['Text'] = df.apply(lambda r: 
r['Text'][: r['Pronoun-offset']] + 'pronountarget' + r['Text'][r['Pronoun-offset'] + len(str(r['Pronoun'])): ], axis=1) df['Text'] = df.apply(lambda r: name_replace(r['Text'], r['A_Noun'], 'subjectone'), axis=1) df['Text'] = df.apply(lambda r: name_replace(r['Text'], r['B_Noun'], 'subjecttwo'), axis=1) return(df) train = get_features(train) test = get_features(test)",Yes,5,8.0 "def get_nlp_features(s, w): doc = nlp(str(s)) tokens = pd.DataFrame([[token.text, token.dep_] for token in doc], columns=['text', 'dep']) return len(tokens[((tokens['text']==w) & (tokens['dep']=='poss'))]) train['A-poss'] = train['Text'].map(lambda x: get_nlp_features(x, 'subjectone')) train['B-poss'] = train['Text'].map(lambda x: get_nlp_features(x, 'subjecttwo')) test['A-poss'] = test['Text'].map(lambda x: get_nlp_features(x, 'subjectone')) test['B-poss'] = test['Text'].map(lambda x: get_nlp_features(x, 'subjecttwo'))",No,4,8.0 "train = train.rename(columns={'A-coref':'A', 'B-coref':'B'}) train['A'] = train['A'].astype(int) train['B'] = train['B'].astype(int) train['NEITHER'] = 1.0 - (train['A'] + train['B'])",No,4,16.0 "test_stage_1 = pd.read_csv(""../input/test_stage_1.tsv"", sep=""\\t"") test_stage_2 = pd.read_csv(""../input/test_stage_2.tsv"", sep=""\\t"")'",No,5,45.0 "gap_test = pd.read_csv(""https://raw.githubusercontent.com/google-research-datasets/gap-coreference/master/gap-test.tsv"", delimiter='\\t') gap_valid = pd.read_csv(""https://raw.githubusercontent.com/google-research-datasets/gap-coreference/master/gap-validation.tsv"", delimiter='\\t')'",No,5,45.0 gap_test[0:5],No,5,41.0 "def get_prior(df): # count how many times neither antecedent is correct for the pronoun Neither_count = len(df) - sum(df[""A-coref""] | df[""B-coref""]) # count the A coreferences A_count = sum(df[""A-coref""]) # count the B coreferences B_count = sum(df[""B-coref""]) # total number of samples test_total = len(df) # compute the prior probabilities of the three classes Neither_prior = Neither_count/test_total A_prior = A_count/test_total B_prior = B_count/test_total print(""Prior probabilities:"") print(""Neither: ""+str(Neither_prior),""A: ""+str(A_prior),""B: ""+str(B_prior)) # sanity check whether everything adds up assert Neither_count + A_count + B_count == test_total return A_prior, B_prior, Neither_prior A_prior,B_prior,Neither_prior = get_prior(gap_test) ",No,5,8.0 "sample_submission = pd.read_csv(""../input/sample_submission_stage_1.csv"") ",No,5,45.0 "def assign_prior(df): sub = pd.DataFrame() for index, row in df.iterrows(): sub.loc[index, ""ID""] = row[""ID""] sub.loc[index, ""A""] = A_prior sub.loc[index, ""B""] = B_prior sub.loc[index, ""NEITHER""] = Neither_prior return sub",No,5,8.0 "train = assign_prior(gap_test) valid = assign_prior(gap_valid)",No,5,53.0 "from sklearn.metrics import log_loss def get_gold(df): gold = [] for index, row in df.iterrows(): if (row[""A-coref""]): gold.append(""A"") else: if (row[""B-coref""]): gold.append(""B"") else: gold.append(""NEITHER"") return gold",No,5,53.0 "train_gold = get_gold(gap_test) valid_gold = get_gold(gap_valid)",No,5,8.0 "train_pred = train[[""A"",""B"",""NEITHER""]] log_loss(train_gold,train_pred)",No,3,8.0 "valid_pred = valid[[""A"",""B"",""NEITHER""]] log_loss(valid_gold,valid_pred)",No,3,8.0 sub1 = assign_prior(test_stage_1),No,3,8.0 sub1[0:4],No,5,41.0 "sub1.to_csv(""submission_1.csv"", index=False)",No,5,25.0 "train_female = assign_prior(female_gap_test) train_male = assign_prior(male_gap_test) valid_female = assign_prior(female_gap_valid) valid_male = 
assign_prior(male_gap_valid)",No,3,8.0 "train_gold_female = get_gold(female_gap_test) train_gold_male = get_gold(male_gap_test)",No,3,8.0 "train_pred_female = train_female[[""A"",""B"",""NEITHER""]] log_loss(train_gold_female,train_pred_female)",No,3,8.0 "train_pred_male = train_male[[""A"",""B"",""NEITHER""]] log_loss(train_gold_male,train_pred_male)",No,3,8.0 "test_df_pop=pd.merge(test_df, country_lookup, how='left', left_on='Country_Region', right_on='Country (or dependency)')",No,5,32.0 test_df_pop.info(),No,5,40.0 "train_df_pop.drop(""Country (or dependency)"", axis=1, inplace=True)",No,5,10.0 "test_df_pop.drop(""Country (or dependency)"", axis=1, inplace=True)",No,5,10.0 %load_ext google.cloud.bigquery,No,5,53.0 "weather_df[""da""]=weather_df[""da""].astype(int)",No,5,16.0 "weather_df['day_from_jan_first'] = weather_df[""da""] + 31*(weather_df[""mo""]=='02') + 60*(weather_df[""mo""]=='03') + 91*(weather_df[""mo""]=='04') '",No,5,8.0 "train_wk1=pd.read_csv(""../input/training-dataset-from-covid-19-week-1-forecasting/train-3.csv"") ",No,5,45.0 "train_wk1['country+province'] = train_wk1['Country/Region'].fillna('') + '-' + train_wk1['Province/State'].fillna('') train_df_pop['country+province'] = train_df_pop['Country_Region'].fillna('') + '-' + train_df_pop['Province_State'].fillna('') test_df_pop['country+province'] = test_df_pop['Country_Region'].fillna('') + '-' + test_df_pop['Province_State'].fillna('')",No,4,17.0 "df = train_wk1.groupby('country+province')[['Lat', 'Long']].mean()",No,5,60.0 "train_df_pop.reset_index(drop=True, inplace=True)",No,5,84.0 "train_df_pop[""Id""]=train_df_pop.index",No,5,8.0 train_df_pop.isnull().sum(),No,5,39.0 labelencoder = LabelEncoder(),No,5,20.0 train_df_pop['Country_Region_Types'] = labelencoder.fit_transform(train_df_pop['Country_Region']),No,5,20.0 test_df_pop['Country_Region_Types'] = labelencoder.fit_transform(test_df_pop['Country_Region']),No,5,20.0 train_df_pop.head(),No,5,41.0 test_df_pop.head(),No,5,41.0 "train_df_pop.rename(columns={""Population (2020)"":""Population""}, inplace=True)",No,5,61.0 "test_df_pop.rename(columns={""Population (2020)"":""Population""}, inplace=True)",No,5,61.0 X_dataset=train_df_pop.copy(),No,5,12.0 "X_dataset=X_dataset[[""Date"",""Population"",""Density"",""day_from_jan_first"",""temp"",""min"",""max"",""stp"",""wdsp"",""prcp"",""fog"",""Country_Region_Types"",""Lat"",""Long""]]",No,5,10.0 "X_dataset[""Date""] = X_dataset[""Date""].apply(lambda x:x.replace(""-"","""")) X_dataset[""Date""] = X_dataset[""Date""].astype(int)",No,3,16.0 X_dataset.info(),No,5,40.0 X_dataset.head(),No,5,41.0 "X_dataset[""fog""] = X_dataset[""fog""].astype(int) X_dataset[""wdsp""] = X_dataset[""wdsp""].astype(float)",No,5,16.0 "X_dataset[""Date""].max()",No,5,40.0 "valid_gold_female = get_gold(female_gap_valid) valid_gold_male = get_gold(male_gap_valid)",No,3,8.0 "valid_pred_female = valid_female[[""A"",""B"",""NEITHER""]] log_loss(valid_gold_female,valid_pred_female)",No,3,8.0 "valid_pred_male = valid_male[[""A"",""B"",""NEITHER""]] log_loss(valid_gold_male,valid_pred_male)",No,3,8.0 len(female_test_stage_2),No,4,58.0 len(male_test_stage_2),No,5,58.0 sub2 = assign_prior(test_stage_2),No,3,8.0 sub2.head(),No,5,41.0 "sub2.to_csv(""submission.csv"", index=False)",No,5,25.0 "import numpy as np import pandas as pd import spacy from spacy import displacy nlp = spacy.load(""en_core_web_sm"") import nltk from sklearn import * from sklearn.model_selection import KFold from sklearn.ensemble import RandomForestClassifier import xgboost as xgb 
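# [Editor's note] In the COVID-19 cells above, LabelEncoder().fit_transform is applied to the
# train and test Country_Region columns separately; if the two country sets ever differ, the
# resulting integer codes will not match between the frames. A safer pattern is to fit the
# encoder once and then transform each frame, for example:
# labelencoder.fit(pd.concat([train_df_pop['Country_Region'], test_df_pop['Country_Region']]))
# train_df_pop['Country_Region_Types'] = labelencoder.transform(train_df_pop['Country_Region'])
# test_df_pop['Country_Region_Types'] = labelencoder.transform(test_df_pop['Country_Region'])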
from xgboost import XGBClassifier import lightgbm as lgb import warnings warnings.filterwarnings(""ignore"") import time",Yes,4,45.0 "test = pd.read_csv(""../input/test_stage_1.tsv"", delimiter=""\\t"").rename(columns={""A"": ""A_Noun"", ""B"": ""B_Noun""}) sub = pd.read_csv(""../input/sample_submission_stage_1.csv"") test.shape, sub.shape'",No,4,45.0 "gh_test = pd.read_csv(""https://raw.githubusercontent.com/google-research-datasets/gap-coreference/master/gap-test.tsv"", delimiter='\\t') gh_valid = pd.read_csv(""https://raw.githubusercontent.com/google-research-datasets/gap-coreference/master/gap-validation.tsv"", delimiter='\\t') train = pd.concat((gh_test, gh_valid)).rename(columns={'A': 'A_Noun', 'B': 'B_Noun'}).reset_index(drop=True) train.shape'",Yes,4,45.0 "def name_replace(s, r1, r2): s = str(s).replace(r1,r2) for r3 in r1.split("" ""): s = str(s).replace(r3,r2) return s def get_features(df): df['section_min'] = df[['Pronoun-offset', 'A-offset', 'B-offset']].min(axis=1) df['Pronoun-offset2'] = df['Pronoun-offset'] + df['Pronoun'].map(len) df['A-offset2'] = df['A-offset'] + df['A_Noun'].map(len) df['B-offset2'] = df['B-offset'] + df['B_Noun'].map(len) df['section_max'] = df[['Pronoun-offset2', 'A-offset2', 'B-offset2']].max(axis=1) df['Text'] = df.apply(lambda r: name_replace(r['Text'], r['A_Noun'], 'subjectone'), axis=1) df['Text'] = df.apply(lambda r: name_replace(r['Text'], r['B_Noun'], 'subjecttwo'), axis=1) df['A-dist'] = (df['Pronoun-offset'] - df['A-offset']).abs() df['B-dist'] = (df['Pronoun-offset'] - df['B-offset']).abs() return(df) train = get_features(train) test = get_features(test)'",Yes,4,8.0 "%%time def get_nlp_features(s, w): doc = nlp(str(s)) tokens = pd.DataFrame([[token.text, token.dep_] for token in doc], columns=['text', 'dep']) return len(tokens[((tokens['text']==w) & (tokens['dep']=='poss'))]) train['A-poss'] = train['Text'].map(lambda x: get_nlp_features(x, 'subjectone')) train['B-poss'] = train['Text'].map(lambda x: get_nlp_features(x, 'subjecttwo')) test['A-poss'] = test['Text'].map(lambda x: get_nlp_features(x, 'subjectone')) test['B-poss'] = test['Text'].map(lambda x: get_nlp_features(x, 'subjecttwo'))",No,5,8.0 "train = train.rename(columns={""A-coref"": ""A"", ""B-coref"": ""B""}) train[""A""] = train[""A""].astype(int) train[""B""] = train[""B""].astype(int) train[""NEITHER""] = 1.0 - (train[""A""] + train[""B""])",No,4,16.0 "col = [""Pronoun-offset"", ""A-offset"", ""B-offset"", ""section_min"", ""Pronoun-offset2"", ""A-offset2"", ""B-offset2"", ""section_max"", ""A-poss"", ""B-poss"", ""A-dist"", ""B-dist""] x1, x2, y1, y2 = model_selection.train_test_split(train[col].fillna(-1), train[[""A"", ""B"", ""NEITHER""]], test_size=0.2, random_state=1) x1.head()",No,4,13.0 "# set hyper parameters lgb_params = {""learning_rate"": 0.01, ""num_leaves"": 16, ""min_data_in_leaf"": 20, ""boosting"": ""gbdt"", ""num_iterations"": 120, ""bagging_fraction"": 0.6, ""feature_fraction"": 1.0, ""seed"": 42, ""num_threads"": -1 } """""" xgb_params = {""eta"": 0.05, ""max_depth"": 2, ""n_estimators"": 120, ""objective"": ""binary:logistic"", ""eval_metric"": ""logloss"", ""booster"": ""gbtree"", ""subsample"": 0.6, ""colsample_bytree"": 0.6, ""seed"": 42, ""n_jobs"": -1 } """""" #model = multiclass.OneVsRestClassifier(ensemble.RandomForestClassifier(max_depth=7, n_estimators=1000, random_state=33)) #model = multiclass.OneVsRestClassifier(xgb.XGBClassifier(**xgb_params)) model = multiclass.OneVsRestClassifier(lgb.LGBMClassifier(**lgb_params)) # 5 fold CV folds = 5 kf = 
KFold(n_splits=folds, shuffle=False, random_state=11) trn = train[col].fillna(-1) val = train[[""A"", ""B"", ""NEITHER""]] scores = [] i = 0 for train_index, test_index in kf.split(train): x1, x2 = trn.iloc[train_index], trn.iloc[test_index] y1, y2 = val.iloc[train_index], val.iloc[test_index] model.fit(x1, y1) score = metrics.log_loss(y2, model.predict_proba(x2)) print(str(i+1), ""log-loss:"", score) scores.append(score) i += 1 print(""CV Score(log-loss):"", np.mean(scores)) model.fit(train[col].fillna(-1), train[[""A"", ""B"", ""NEITHER""]]) results = model.predict_proba(test[col]) test[""A""] = results[:,0] test[""B""] = results[:,1] test[""NEITHER""] = results[:,2] test[[""ID"", ""A"", ""B"", ""NEITHER""]].to_csv(""submission.csv"", index=False)",Yes,3,7.0 "b""test = pd.read_csv('../input/test_stage_1.tsv', delimiter='\\t').rename(columns={'A': 'A_Noun', 'B': 'B_Noun'})\nsub = pd.read_csv('../input/sample_submission_stage_1.csv')\ntest.shape, sub.shape""",No,4,45.0 "# True test here: #gh_train = pd.read_csv(""https://raw.githubusercontent.com/google-research-datasets/gap-coreference/master/gap-development.tsv"", delimiter='\\t') gh_test = pd.read_csv(""https://raw.githubusercontent.com/google-research-datasets/gap-coreference/master/gap-test.tsv"", delimiter='\\t') gh_valid = pd.read_csv(""https://raw.githubusercontent.com/google-research-datasets/gap-coreference/master/gap-validation.tsv"", delimiter='\\t') train = pd.concat((gh_test, gh_valid)).rename(columns={'A': 'A_Noun', 'B': 'B_Noun'}).reset_index(drop=True) train.shape'",No,4,45.0 "def get_coref(row): coref = None nlpr = nlp(row['Text']) # dunno if more direct way to get token from text offset for tok in nlpr.doc: if tok.idx == row['Pronoun-offset']: # model limitation that sometimes there are no coref clusters for the token? # also, sometimes the coref clusters will just be something like: # He: his, him, his # So there is no proper name to map back to? 
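# [Editor's note] The tok._.coref_clusters attribute read in the try block below is not provided
# by the base spaCy model loaded earlier; it comes from the neuralcoref extension, which the
# original notebook presumably registered on the pipeline with something like
# import neuralcoref followed by neuralcoref.add_to_pipe(nlp).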
try: if len(tok._.coref_clusters) > 0: coref = tok._.coref_clusters[0][0].text except: # for some, get the following exception just checking len(tok._.coref_clusters) # *** TypeError: 'NoneType' object is not iterable pass break if coref: coref = coref.lower() # sometimes the coref is I think meant to be the same as A or B, but # it is either a substring or superstring of A or B A_Noun = row['A_Noun'].lower() B_Noun = row['B_Noun'].lower() if coref in A_Noun or A_Noun in coref: coref = A_Noun elif coref in B_Noun or B_Noun in coref: coref = B_Noun return coref",Yes,2,8.0 "def get_coref_features(df): df['Coref'] = df.apply(get_coref, axis=1) df['Spacy-Coref-A'] = df['Coref'] == df['A_Noun'].str.lower() df['Spacy-Coref-B'] = df['Coref'] == df['B_Noun'].str.lower() return df train = get_coref_features(train) test = get_coref_features(test)",No,2,8.0 "def name_replace(s, r1, r2): s = str(s).replace(r1,r2) for r3 in r1.split(' '): s = str(s).replace(r3,r2) return s def get_features(df): df['section_min'] = df[['Pronoun-offset', 'A-offset', 'B-offset']].min(axis=1) df['Pronoun-offset2'] = df['Pronoun-offset'] + df['Pronoun'].map(len) df['A-offset2'] = df['A-offset'] + df['A_Noun'].map(len) df['B-offset2'] = df['B-offset'] + df['B_Noun'].map(len) df['section_max'] = df[['Pronoun-offset2', 'A-offset2', 'B-offset2']].max(axis=1) #df['Text'] = df.apply(lambda r: r['Text'][: r['Pronoun-offset']] + 'pronountarget' + r['Text'][r['Pronoun-offset'] + len(str(r['Pronoun'])): ], axis=1) df['Text'] = df.apply(lambda r: name_replace(r['Text'], r['A_Noun'], 'subjectone'), axis=1) df['Text'] = df.apply(lambda r: name_replace(r['Text'], r['B_Noun'], 'subjecttwo'), axis=1) df['A-dist'] = (df['Pronoun-offset'] - df['A-offset']).abs() df['B-dist'] = (df['Pronoun-offset'] - df['B-offset']).abs() return(df) train = get_features(train) test = get_features(test)",No,3,8.0 "col = ['Pronoun-offset', 'A-offset', 'B-offset', 'section_min', 'Pronoun-offset2', 'A-offset2', 'B-offset2', 'section_max', 'A-poss', 'B-poss', 'A-dist', 'B-dist', 'Spacy-Coref-A', 'Spacy-Coref-B'] x1, x2, y1, y2 = model_selection.train_test_split(train[col].fillna(-1), train[['A', 'B', 'NEITHER']], test_size=0.2, random_state=1) x1.head()",No,4,13.0 "model = multiclass.OneVsRestClassifier(ensemble.RandomForestClassifier(max_depth = 7, n_estimators=1000, random_state=33)) # model = multiclass.OneVsRestClassifier(ensemble.ExtraTreesClassifier(n_jobs=-1, n_estimators=100, random_state=33)) # param_dist = {'objective': 'binary:logistic', 'max_depth': 1, 'n_estimators':1000, 'num_round':1000, 'eval_metric': 'logloss'} # model = multiclass.OneVsRestClassifier(xgb.XGBClassifier(**param_dist)) model.fit(x1, y1) print('log_loss', metrics.log_loss(y2, model.predict_proba(x2))) model.fit(train[col].fillna(-1), train[['A', 'B', 'NEITHER']]) results = model.predict_proba(test[col]) test['A'] = results[:,0] test['B'] = results[:,1] test['NEITHER'] = results[:,2] test[['ID', 'A', 'B', 'NEITHER']].to_csv('submission.csv', index=False)",No,3,7.0 "import os import csv import json import string import keras from pandas.io.json import json_normalize import matplotlib.pyplot as plt import seaborn as sns color = sns.color_palette() from math import floor import spacy %matplotlib inline from plotly import tools import plotly.offline as py py.init_notebook_mode(connected=True) import plotly.graph_objs as go from sklearn import model_selection, preprocessing, metrics, ensemble, naive_bayes, linear_model from sklearn.feature_extraction.text import TfidfVectorizer, 
CountVectorizer from sklearn.decomposition import TruncatedSVD import lightgbm as lgb import time from tqdm import tqdm import math from sklearn.model_selection import train_test_split import regex as re",No,5,23.0 nlp = spacy.load('en_core_web_sm'),No,5,30.0 "y1=train_df_pop[train_df_pop[""Date""]<""2020-03-19""][""ConfirmedCases""] y2=train_df_pop[train_df_pop[""Date""]<""2020-03-19""][""Fatalities""]",No,5,14.0 print(find_name_between_paran(analyze)),No,5,53.0 "#Confirmed Cases X_train_confirmed, X_test_confirmed, y_train_confirmed, y_test_confirmed = train_test_split(X_dataset, y1, test_size = .20, random_state = 42)",No,5,13.0 "results_df = pd.DataFrame({""correct"":correct_name_list}) results_df['prv_obj'] = prev_dobj_list results_df['pr_ls_sbj'] = prev_subj_list results_df['pr_1_sbj'] = prev_first_subj_list results_df['pr_2_sbj'] = prev_second_subj_list results_df['c1st_sj'] = curr1st_subj_list results_df['c2nd_sj'] = curr2nd_subj_list results_df['c1st_ob'] = curr1st_dobj_list results_df['c2nd_ob'] = curr2nd_dobj_list results_df['c1st_ap'] = curr1st_appos_list results_df['w_bt_pa'] = word_btwn_paran_list results_df['pronoun'] = pronoun_list results_df['offset'] = pronoun_offset_list'",No,3,55.0 "import plotly.express as px import pandas as pd import plotly.graph_objects as go import numpy as np datafile = '../input/covid19-global-forecasting-week-2/train.csv' data = pd.read_csv(datafile) data['PSCR'] = data.Province_State.map(str)+data.Country_Region.map(str) # %% # ip pattern of the empirical data from 2020/03/19 onwards region = pd.unique(data['PSCR']).tolist() f_region = [] time_list = [] region_name = [] actual_date = [] no_infection_country = [] for ci in range(len(region)): region_data = data[data['PSCR'] == region[ci]] region_data = region_data[region_data.ConfirmedCases > 0] inc_percentage = (region_data.ConfirmedCases[1:].to_numpy( )-region_data.ConfirmedCases[:-1].to_numpy())/region_data.ConfirmedCases[:-1].to_numpy() # Only considering the countries with effective data if len(np.where(inc_percentage > 0)[0]) > 0: inc_percentage = inc_percentage[np.where(inc_percentage > 0)[0][0]:] actual_date.append(region_data.Date[1:]) f_region.extend(inc_percentage) time_list.extend([i for i in range(len(inc_percentage))]) region_name.extend([region[ci] for i in range(len(inc_percentage))]) else: no_infection_country.append(region[ci]) f_df = pd.DataFrame( {'increase': f_region, 'Day': time_list, 'PSCR': region_name}) # %% # Simulation data for training sim_data = [] speed = [0.01,0.1,1] for batch in range(1,4): result = f'../input/simulation-scripts/outfile_s{batch}.npz' container = np.load(result) speed_batch = f'Sim: speed {speed[batch-1]}' sim_result = [container[key] for key in container] num_infected = [] for t in range(len(sim_result)): num_infected.append(len(np.where(sim_result[t] < 30)[0])) inc_infected = [(num_infected[i+1]-num_infected[i])/num_infected[i] for i in range(len(num_infected)-1)] infected_growth_df = pd.DataFrame({'increase': inc_infected, 'Day': [ i for i in range(len(sim_result)-1)], 'PSCR': speed_batch}) sim_data.append(infected_growth_df) sim_df = pd.concat(sim_data) # %% criteria_day_length = 10 sim_class_ip = [] for speed in pd.unique(sim_df.PSCR): sim_class_ip.append(sim_df[sim_df['PSCR'] == speed].increase.tolist()) sim_class_ip_array = np.array(sim_class_ip) #%% labels = [] effective_region = [] for region_loop in region: if region_loop not in no_infection_country: ip = f_df[f_df['PSCR'] == region_loop].increase[:criteria_day_length].tolist() 
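# [Editor's note] The lines that follow label each region with one of the three simulated
# spreading scenarios: the Euclidean distance between the region's first criteria_day_length
# daily growth rates and the matching slice of each simulated growth-rate curve is computed,
# and the scenario with the smallest distance (later shown as Slow, Moderate or Fast) is kept.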
euclidean_dis = np.linalg.norm(np.array(ip)-sim_class_ip_array[:,:len(ip)],axis = 1) labels.append(np.where(euclidean_dis == min(euclidean_dis))[0][0]) effective_region.append(region_loop) else: pass xlabels = ['Slow','Moderate','Fast'] scenario_class = {'ip': [xlabels[i] for i in labels], 'Area':effective_region, 'width': [1 for i in range(len(labels))]} sce_df = pd.DataFrame(scenario_class) #%% fig = px.bar(sce_df, x=""ip"", y=""width"", color='Area', height=400) fig.update_layout(title='Strategies of regions', xaxis_title='Strategy', yaxis_title='Areas and regions', xaxis=dict( showline=True, showgrid=False, showticklabels=True, linecolor='rgb(204, 204, 204)', linewidth=2, ticks='outside', tickfont=dict( family='Arial', size=12, color='rgb(82, 82, 82)', ) ), yaxis=dict( showline=True, showgrid=False, showticklabels=True, linecolor='rgb(204, 204, 204)', linewidth=2, ticks='outside', tickfont=dict( family='Arial', size=12, color='rgb(82, 82, 82)', ), ), autosize=True, plot_bgcolor='white', height=600, width=800, ) fig.show()'",Yes,2,45.0 results_df.head(250),No,5,41.0 "dt1=DecisionTreeRegressor(criterion=""friedman_mse"",max_depth=20,random_state=42)",No,5,4.0 "dt1.fit(X_train_confirmed, y_train_confirmed)",No,5,7.0 print(text_list[229]),No,5,41.0 y_pred_dt_confirmed=dt1.predict(X_test_confirmed),No,5,48.0 "np.sqrt(mean_squared_log_error( y_test_confirmed, y_pred_dt_confirmed ))",No,5,49.0 "train = pd.concat([train,pd.get_dummies(train['Country_Region'], prefix='ps')],axis=1) train.drop(['Country_Region'],axis=1, inplace=True) train = pd.concat([train,pd.get_dummies(train['Province_State'], prefix='ps')],axis=1) train.drop(['Province_State'],axis=1, inplace=True)",Yes,3,11.0 "doc = nlp(analyze) for token in doc: print(token.text, token.dep_, token.head.pos_)",No,5,41.0 "#Fatalities X_train_fatal, X_test_fatal, y_train_fatal, y_test_fatal = train_test_split(X_dataset, y2, test_size = .20, random_state = 42)",No,5,13.0 print(text_list[900:950]),No,5,41.0 "out_df = pd.DataFrame({""ID"":test_ids})",No,5,12.0 "# Using the data on 18 Mar to calculate the tendency of the pandemic. 
date_datause = '2020-03-18' date_actualdata = '2020-03-30' date_length = (pd.to_datetime(date_actualdata) - pd.to_datetime(date_datause)).days predict_region_list = [] effect_ind = 0 for it in range(len(region)): region_it = region[it] if region_it not in no_infection_country: time_length_it = actual_date[effect_ind] sim_class_it = labels[effect_ind] predict_ip_it = sim_class_ip_array[sim_class_it,(len(actual_date[0])-date_length):] while len(predict_ip_it)< (date_length+31): predict_ip_it = np.append(predict_ip_it,predict_ip_it[len(predict_ip_it)-1]) retion_df = data[data['PSCR'] == region_it] num_infected_it = retion_df[retion_df['Date'] == date_datause]['ConfirmedCases'].astype(float) predict_region_list_it = [] ini_infected = num_infected_it.tolist()[0] for predict_day in range(len(predict_ip_it)): predict_region_list_it.append(ini_infected * (1+predict_ip_it[predict_day])) ini_infected = predict_region_list_it[predict_day] predict_region_list.extend(predict_region_list_it) effect_ind += 1 else: predict_region_list.extend([0 for i in range(43)]) # %% # Write output csv file import csv from itertools import zip_longest list1 = [i+1 for i in range(len(predict_region_list))] list2 = predict_region_list list3 = [0 for i in range(len(predict_region_list))] d = [list1, list2,list3] export_data = zip_longest(*d, fillvalue = '') with open('submission.csv', 'w', encoding=""ISO-8859-1"", newline='') as myfile: wr = csv.writer(myfile) wr.writerow((""ForecastId"", ""ConfirmedCases"", ""Fatalities"")) wr.writerows(export_data) myfile.close()'",Yes,2,25.0 "# aggregate cases and fatalities def do_aggregation(df, col, mean_range): df_new = copy.deepcopy(df) col_new = '{}_({}-{})'.format(col, mean_range[0], mean_range[1]) df_new[col_new] = 0 tmp = df_new[col].rolling(mean_range[1]-mean_range[0]+1).mean() df_new[col_new][mean_range[0]:] = tmp[:-(mean_range[0])] df_new[col_new][pd.isna(df_new[col_new])] = 0 return df_new[[col_new]].reset_index(drop=True) def do_aggregations(df): df = pd.concat([df, do_aggregation(df, 'cases/day', [1,1]).reset_index(drop=True)], axis=1) df = pd.concat([df, do_aggregation(df, 'cases/day', [1,7]).reset_index(drop=True)], axis=1) df = pd.concat([df, do_aggregation(df, 'cases/day', [8,14]).reset_index(drop=True)], axis=1) df = pd.concat([df, do_aggregation(df, 'cases/day', [15,21]).reset_index(drop=True)], axis=1) df = pd.concat([df, do_aggregation(df, 'fatal/day', [1,1]).reset_index(drop=True)], axis=1) df = pd.concat([df, do_aggregation(df, 'fatal/day', [1,7]).reset_index(drop=True)], axis=1) df = pd.concat([df, do_aggregation(df, 'fatal/day', [8,14]).reset_index(drop=True)], axis=1) df = pd.concat([df, do_aggregation(df, 'fatal/day', [15,21]).reset_index(drop=True)], axis=1) for threshold in [1, 10, 100]: days_under_threshold = (df['ConfirmedCases'] 0: candidate = tc_name_words[-1] if candidate in get_tc_f_s or candidate in get_tc_l_s: if len(tc_name_words) > 1: candidate = tc_name_words[-1] if check_if_name(curr_tok,candidate): get_tc_l_nw = candidate ### tclnw Random forest classifier label special line: if get_tc_l_nw in row[inquiry_part] or row[inquiry_part] in get_tc_l_nw: train_vector.append(1) else: train_vector.append(0) #get first aposs in trunc curr @@@@@@@@@@@@@@@@@@@@@@@@@@@ get_tc_f_a = ""none"" for n in [1,2,3,4]: dummy_tc_f_a = find_nth_appos(curr_doc,n) if check_if_name(curr_tok,dummy_tc_f_a) and get_tc_f_a == ""none"": get_tc_f_a = dummy_tc_f_a ### tcfa Random forest classifier label special line: if get_tc_f_a in row[inquiry_part] or row[inquiry_part] in 
get_tc_f_a: train_vector.append(1) else: train_vector.append(0) #get word btwn paranthesis in prev @@@@@@@@@@@@@@@@@@@@@@@@@@@ get_p_f_wp = find_name_words(name_btwn_paran(prev)) ### pfwp Random forest classifier label special line: if get_p_f_wp in row[inquiry_part] or row[inquiry_part] in get_p_f_wp: train_vector.append(1) else: train_vector.append(0) #get word btwn paranthesis in trunc curr @@@@@@@@@@@@@@@@@@@@@@@@@@@ get_tc_l_wp = find_name_words(name_btwn_paran(curr)) ### tclwp Random forest classifier label special line: if get_tc_l_wp in row[inquiry_part] or row[inquiry_part] in get_tc_l_wp: train_vector.append(1) else: train_vector.append(0) #get last subj in remainder @@@@@@@@@@@@@@@@@@@@@@@@@@@ get_r_f_s = ""none"" for n in [1,2,3,4,5,6,7,8]: #in the final version, each of the name subjects will be accunted for dummy_r_f_s = find_nth_subj(curr_doc,n) if dummy_r_f_s in remainder and check_if_name(curr_tok,dummy_r_f_s): get_r_f_s = dummy_r_f_s ### rfs Random forest classifier label special line: if get_r_f_s in row[inquiry_part] or row[inquiry_part] in get_r_f_s: train_vector.append(1) else: train_vector.append(0) #get last dobj in remainder @@@@@@@@@@@@@@@@@@@@@@@@@@@ get_r_f_o = ""none"" for n in [1,2,3,4,5,6,7,8]: #in the final version, each of the name objects will be accunted for dummy_r_f_o = find_nth_dobj(curr_doc,n) if dummy_r_f_o in remainder and check_if_name(curr_tok,dummy_r_f_o): get_r_f_o = dummy_r_f_o ### rfo Random forest classifier label special line: if get_r_f_o in row[inquiry_part] or row[inquiry_part] in get_r_f_o: train_vector.append(1) else: train_vector.append(0) #get last appos in remainder @@@@@@@@@@@@@@@@@@@@@@@@@@@ get_r_f_a = ""none"" for n in [1,2,3,4]: dummy_r_f_a = find_nth_appos(curr_doc,n) if dummy_r_f_a in remainder and check_if_name(curr_tok,dummy_r_f_a): get_r_f_a = dummy_r_f_a ### rfa Random forest classifier label special line: if get_r_f_a in row[inquiry_part] or row[inquiry_part] in get_r_f_a: train_vector.append(1) else: train_vector.append(0) #get first appos in current @@@@@@@@@@@@@@@@@@@@@@@@@@@ get_c_f_a = ""none"" for n in [1,2,3,4]: dummy_c_f_a = find_nth_appos(curr_doc,n) if check_if_name(curr_tok,dummy_c_f_a) and get_c_f_a == ""none"": get_c_f_a = dummy_c_f_a ### cfa Random forest classifier label special line: if get_c_f_a in row[inquiry_part] or row[inquiry_part] in get_c_f_a: train_vector.append(1) else: train_vector.append(0) #get first appos in prev @@@@@@@@@@@@@@@@@@@@@@@@@@@ get_p_f_a = ""none"" for n in [1,2,3,4]: dummy_p_f_a = find_nth_appos(prev_doc,n) if check_if_name(prev_tok,dummy_p_f_a) and get_p_f_a == ""none"": get_p_f_a = dummy_p_f_a ### pfa Random forest classifier label special line: if get_p_f_a in row[inquiry_part] or row[inquiry_part] in get_p_f_a: train_vector.append(1) else: train_vector.append(0) #check_if_poss_her get_poss_her = check_if_poss_her(curr_doc, pronoun) #rand_forest classifier for pronoun type: if pronoun == ""he"" or pronoun == ""she"": train_vector.append(1) elif pronoun == ""He"" or pronoun == ""She"": train_vector.append(2) elif pronoun == ""his"" or (pronoun == ""her"" and get_poss_her): train_vector.append(3) elif pronoun == ""him"" or (pronoun == ""her"" and not get_poss_her): train_vector.append(4) elif pronoun == ""His"" or (pronoun == ""Her"" and get_poss_her): train_vector.append(5) else: train_vector.append(6) return train_vector'",No,2,1.0 "def run_bert(data): \t''' \tRuns a forward propagation of BERT on input text, extracting contextual word embeddings \tInput: data, a pandas 
DataFrame containing the information in one of the GAP files \tOutput: emb, a pandas DataFrame containing contextual embeddings for the words A, B and Pronoun. Each embedding is a numpy array of shape (768) \tcolumns: ""emb_A"": the embedding for word A \t ""emb_B"": the embedding for word B \t ""emb_P"": the embedding for the pronoun \t ""label"": the answer to the coreference problem: ""A"", ""B"" or ""NEITHER"" \t''' # From the current file, take the text only, and write it in a file which will be passed to BERT \ttext = data[""Text""] \ttext.to_csv(""input.txt"", index = False, header = False) \ttask_name = ""kepler"" #\tprocessors = {""kepler"": run_classifier.KeplerProcessor} #\tprocessors = {""kepler"": run_classifier.MrpcProcessor} #processor = processors[""kepler""] # The script extract_features.py runs forward propagation through BERT, and writes the output in the file output.jsonl # I'm lazy, so I'm only saving the output of the last layer. Feel free to change --layers = -1 to save the output of other layers. \tos.system(""python3 extract_features.py \\ \t --input_file=input.txt \\ \t --output_file=output.jsonl \\ \t --vocab_file=uncased_L-12_H-768_A-12/vocab.txt \\ \t --bert_config_file=uncased_L-12_H-768_A-12/bert_config.json \\ \t --init_checkpoint=uncased_L-12_H-768_A-12/bert_model.ckpt \\ \t --layers=-2 \\ \t --max_seq_length=256 \\ \t --batch_size=8"") \tbert_output = pd.read_json(""output.jsonl"", lines = True) \tos.system(""rm output.jsonl"") \tos.system(""rm input.txt"") \tindex = data.index \tcolumns = [""emb_A"", ""emb_B"", ""emb_P"", ""feat_A"", ""feat_B"", ""label""] \temb = pd.DataFrame(index = index, columns = columns) \temb.index.name = ""ID"" \tfor i in range(len(data)): # For each line in the data file \t\t# get the words A, B, Pronoun. Convert them to lower case, since we're using the uncased version of BERT \t\tP = data.loc[i,""Pronoun""] \t\tA = data.loc[i,""A""] \t\tB = data.loc[i,""B""] \t\t# For each word, find the offset not counting spaces. 
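# [Editor's note] compute_offset_no_spaces() and count_length_no_special() are utilities from the
# original repository and are not shown here. Judging from how they are used (aligning character
# offsets with BERT word pieces, which contain no whitespace and use '#' continuation markers),
# minimal stand-ins might look like this; they are a guess, not the original implementation.
def compute_offset_no_spaces_sketch(text, offset):
    # Count the non-space characters that precede the given character offset.
    return sum(1 for c in text[:offset] if c != ' ')

def count_length_no_special_sketch(word):
    # Length of a token, ignoring spaces and the '#' markers that BERT adds to word pieces.
    return sum(1 for c in word if c not in (' ', '#'))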
This is necessary for comparison with the output of BERT \t\tP_offset = compute_offset_no_spaces(data.loc[i,""Text""], data.loc[i,""Pronoun-offset""]) \t\tA_offset = compute_offset_no_spaces(data.loc[i,""Text""], data.loc[i,""A-offset""]) \t\tB_offset = compute_offset_no_spaces(data.loc[i,""Text""], data.loc[i,""B-offset""]) \t\t# Figure out the length of A, B, not counting spaces or special characters \t\tA_length = count_length_no_special(A) \t\tB_length = count_length_no_special(B) \t\t# Initialize embeddings with zeros \t\temb_A = np.zeros(768) \t\temb_B = np.zeros(768) \t\temb_P = np.zeros(768) \t\t# Initialize counts \t\tcount_chars = 0 \t\tcnt_A, cnt_B, cnt_P = 0, 0, 0 \t\tfeatures = pd.DataFrame(bert_output.loc[i,""features""]) # Get the BERT embeddings for the current line in the data file \t\tfor j in range(2,len(features)): # Iterate over the BERT tokens for the current line; we skip over the first 2 tokens, which don't correspond to words \t\t\ttoken = features.loc[j,""token""] \t\t\t# See if the character count until the current token matches the offset of any of the 3 target words \t\t\tif count_chars == P_offset: \t\t\t\t# print(token) \t\t\t\temb_P += np.array(features.loc[j,""layers""][0]['values']) \t\t\t\tcnt_P += 1 \t\t\tif count_chars in range(A_offset, A_offset + A_length): \t\t\t\t# print(token) \t\t\t\temb_A += np.array(features.loc[j,""layers""][0]['values']) \t\t\t\tcnt_A += 1 \t\t\tif count_chars in range(B_offset, B_offset + B_length): \t\t\t\t# print(token) \t\t\t\temb_B += np.array(features.loc[j,""layers""][0]['values']) \t\t\t\tcnt_B += 1 # Update the character count \t\t\tcount_chars += count_length_no_special(token) \t\t# Taking the average between tokens in the span of A or B, so divide the current value by the count\t \t\temb_A /= cnt_A \t\temb_B /= cnt_B \t\t# Work out the label of the current piece of text \t\tlabel = ""Neither"" \t\tif (data.loc[i,""A-coref""] == True): \t\t\tlabel = ""A"" \t\tif (data.loc[i,""B-coref""] == True): \t\t\tlabel = ""B"" \t\tpro_offset = data.loc[i,""Pronoun-offset""] \t\tthis_text = data.loc[i,""Text""] \t\tfeat_A = get_feature_vector(P, this_text, A, B, pro_offset, inquiry_part = ""A"") \t\tfeat_B = get_feature_vector(P, this_text, A, B, pro_offset, inquiry_part = ""B"") \t\t# Put everything together in emb \t\temb.iloc[i] = [emb_A, emb_B, emb_P, np.asarray(feat_A), np.asarray(feat_B), label] \treturn em",No,4,1.0 "from keras import backend, models, layers, initializers, regularizers, constraints, optimizers from keras import callbacks as kc from keras import optimizers as ko from sklearn.model_selection import cross_val_score, KFold, train_test_split from sklearn.metrics import log_loss import time dense_layer_sizes = [37] dropout_rate = 0.6 learning_rate = 0.001 n_fold = 5 batch_size = 32 epochs = 1000 patience = 100 # n_test = 100 lambd = 0.1 # L2 regularization",No,5,59.0 "class IsLayer(Layer): #Layer to be used after a dense one. It will multiply all the elements with each other. #In a sense, it allows the neurons to have a say on each others' outputs. This layer, hopefully, #compares the relative importance of neurons.The compound prob is regulated with weights. #The idea follows from attention layer, but is more basic than that. As it is multiplicative, it is #an alternative to the vanilla additive layer where outputs are added at the next layer. def __init__(self, **kwargs): super(IsLayer, self).__init__(**kwargs) def build(self, input_shape): #Create a trainable weight variable for this layer. 
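# [Editor's note] This class uses the names Layer and K, which the keras import cell above does
# not bring into scope; the original notebook presumably also ran something like
# from keras.layers import Layer and from keras import backend as K before defining IsLayer.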
self.W = self.add_weight(name='W', shape=(input_shape[1], 1), initializer='uniform', trainable=True) super(IsLayer, self).build(input_shape) def call(self, x): x_W = K.dot(x, self.W) x_new = x*x_W return x_new def compute_output_shape(self, input_shape): return (input_shape[0], input_shape[-1])",No,5,53.0 "def build_mlp_model(input_shape, num_output): \tX_input = layers.Input(input_shape) \t# First dense layer \tX = layers.Dense(dense_layer_sizes[0], name = 'dense0')(X_input) \tX = layers.BatchNormalization(name = 'bn0')(X) \tX = layers.Activation('relu')(X) \tX = layers.Dropout(dropout_rate, seed = 7)(X) #\tX = IsLayer()(X) \t# Second dense layer # \tX = layers.Dense(dense_layer_sizes[0], name = 'dense1')(X) # \tX = layers.BatchNormalization(name = 'bn1')(X) # \tX = layers.Activation('relu')(X) # \tX = layers.Dropout(dropout_rate, seed = 9)(X) \t# Output layer \tX = layers.Dense(num_output, name = 'output', kernel_regularizer = regularizers.l2(lambd))(X) \tX = layers.Activation('sigmoid')(X) \t# Create model \tmodel = models.Model(input = X_input, output = X, name = ""classif_model"") \treturn model'",No,5,4.0 "def parse_json(embeddings): \t''' \tParses the embeddigns given by BERT, and suitably formats them to be passed to the MLP model \tInput: embeddings, a DataFrame containing contextual embeddings from BERT, as well as the labels for the classification problem \tcolumns: ""emb_A"": contextual embedding for the word A \t ""emb_B"": contextual embedding for the word B \t ""emb_P"": contextual embedding for the pronoun \t ""label"": the answer to the coreference problem: ""A"", ""B"" or ""NEITHER"" \tOutput: X, a numpy array containing, for each line in the GAP file, the concatenation of the embeddings of the target words \t Y, a numpy array containing, for each line in the GAP file, the one-hot encoded answer to the coreference problem \t''' \tembeddings.sort_index(inplace = True) # Sorting the DataFrame, because reading from the json file messed with the order \tX = np.zeros((len(embeddings)*2,2*768+19)) #19 is the length of special feature vector \tY = np.zeros((len(embeddings)*2, 1)) \t# Concatenate features (A first batch) \tfor i in range(len(embeddings)): \t\tA = np.array(embeddings.loc[i,""emb_A""]) \t\tP = np.array(embeddings.loc[i,""emb_P""]) \t\tF = np.array(embeddings.loc[i,""feat_A""]) \t\tX[i] = np.concatenate((A, P, F)) \t# One-hot encoding for labels \tfor i in range(len(embeddings)): \t\tlabel = embeddings.loc[i,""label""] \t\tif label == ""A"": \t\t\tY[i] = 1 \t\telse: \t\t\tY[i] = 0 \t# Concatenate features (B second batch) \tfor i in range(len(embeddings)): \t\tB = np.array(embeddings.loc[i,""emb_B""]) \t\tP = np.array(embeddings.loc[i,""emb_P""]) \t\tF = np.array(embeddings.loc[i,""feat_B""]) \t\tX[i+len(embeddings)] = np.concatenate((B, P, F)) \t# One-hot encoding for labels ; A's and B's concatenated like same since they are symmetrical \tfor i in range(len(embeddings)): \t\tlabel = embeddings.loc[i,""label""] \t\tif label == ""B"": \t\t\tY[i+len(embeddings)] = 1 \t\telse: \t\t\tY[i+len(embeddings)] = 0 \treturn X, Y'",No,5,53.0 "# Read development embeddigns from json file - this is the output of Bert development = pd.read_json(""contextual_embeddings_gap_development.json"") X_development, Y_development = parse_json(development) validation = pd.read_json(""contextual_embeddings_gap_validation.json"") X_validation, Y_validation = parse_json(validation) test = pd.read_json(""contextual_embeddings_gap_test.json"") X_test, Y_test = parse_json(test)",No,3,1.0 "# There 
may be a few NaN values, where the offset of a target word is greater than the max_seq_length of BERT. # They are very few, so I'm just dropping the rows. remove_test = [row for row in range(len(X_test)) if np.sum(np.isnan(X_test[row]))] X_train = np.delete(X_test, remove_test, 0) Y_train = np.delete(Y_test, remove_test, 0) # We want predictions for all validation rows. So instead of removing rows, make them 0 remove_validation = [row for row in range(len(X_validation)) if np.sum(np.isnan(X_validation[row]))] X_validation[remove_validation] = np.zeros(2*768+19) # We want predictions for all development rows. So instead of removing rows, make them 0 remove_development = [row for row in range(len(X_development)) if np.sum(np.isnan(X_development[row]))] X_development[remove_development] = np.zeros(2*768+19)",No,5,17.0 "# Will train on data from the gap-test and gap-validation files, in total 2454 rows #X_train = np.concatenate((X_test, X_validation), axis = 0) #Y_train = np.concatenate((Y_test, Y_validation), axis = 0) # Will predict probabilities for data from the gap-development file; initializing the predictions #prediction = np.zeros((len(X_development),1)) # testing predictions val_prediction = np.zeros((len(X_validation),1)) # valid predictions",No,5,77.0 "# Training and cross-validation folds = KFold(n_splits=n_fold, shuffle=True, random_state=3) scores = [] for fold_n, (train_index, valid_index) in enumerate(folds.split(X_train)): \t# split training and validation data \tprint('Fold', fold_n, 'started at', time.ctime()) \tX_tr, X_val = X_train[train_index], X_train[valid_index] \tY_tr, Y_val = Y_train[train_index], Y_train[valid_index] \t# Define the model, re-initializing for each fold \tclassif_model = build_mlp_model([X_train.shape[1]],1) \tclassif_model.compile(optimizer = optimizers.Adam(lr = learning_rate), loss = ""binary_crossentropy"") \tcallbacks = [kc.EarlyStopping(monitor='val_loss', patience=patience, restore_best_weights = True)] \t# train the model \tclassif_model.fit(x = X_tr, y = Y_tr, epochs = epochs, batch_size = batch_size, callbacks = callbacks, validation_data = (X_val, Y_val), verbose = 0) \t# make predictions on validation and test data \tpred_valid = classif_model.predict(x = X_val, verbose = 0) \t# oof[valid_index] = pred_valid.reshape(-1,) \tscores.append(log_loss(Y_val, pred_valid)) val_prediction = classif_model.predict(x = X_validation, verbose = 0) # Print CV scores, as well as score on the test data print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores))) print(scores) print(""Test score:"", log_loss(Y_validation,val_prediction))'",No,3,1.0 "def build_neither_mlp(input_shape, num_output): \tX_input = layers.Input(input_shape) \t# First dense layer \tX = layers.Dense(dense_layer_sizes[0], name = 'dense0')(X_input) \tX = layers.BatchNormalization(name = 'bn0')(X) \tX = layers.Activation('relu')(X) \tX = layers.Dropout(dropout_rate, seed = 7)(X) # Output layer \tX = layers.Dense(num_output, name = 'output', kernel_regularizer = regularizers.l2(lambd))(X) \tX = layers.Activation('sigmoid')(X) \t# Create model \tmodel = models.Model(input = X_input, output = X, name = ""neither_model"") \treturn model'",No,5,4.0 "X_val_A = val_prediction[: int(len(val_prediction)/2)] X_val_B = val_prediction[int(len(val_prediction)/2) :] X_train_neither = np.concatenate((X_val_A, X_val_B), axis=1) Y_val_A = Y_validation[: int(len(Y_validation)/2)] Y_val_B = Y_validation[int(len(Y_validation)/2) :] Y_train_neither = 1 - Y_val_A - 
Y_val_B",No,5,21.0 "print(X_val_A.shape) print(X_val_B.shape) print(X_train_neither.shape) print(Y_val_A.shape) print(Y_val_B.shape) print(Y_train_neither.shape)",No,5,58.0 "neither_model = build_neither_mlp([X_train_neither.shape[1]],1) neither_model.compile(optimizer = optimizers.Adam(lr = learning_rate), loss = ""binary_crossentropy"") neither_model.fit(X_train_neither, y = Y_train_neither, epochs = epochs, batch_size = batch_size, validation_data = (X_train_neither, Y_train_neither), verbose = 0) dev_prediction = classif_model.predict(x = X_development, verbose = 0)",No,3,7.0 "X_dev_A = dev_prediction[: int(len(dev_prediction)/2)] X_dev_B = dev_prediction[int(len(dev_prediction)/2) :] X_dev_neither = np.concatenate((X_dev_A, X_dev_B), axis=1)",No,5,13.0 "dev_neither = neither_model.predict(x = X_dev_neither, verbose = 0)",No,5,27.0 "# Write the prediction to file for submission submission = pd.read_csv(""../input/sample_submission_stage_1.csv"", index_col = ""ID"") submission[""A""] = X_dev_A submission[""B""] = X_dev_B submission[""NEITHER""] = dev_neither submission.to_csv(""submission_bert.csv"")",No,4,25.0 "import pandas as pd import numpy as np import keras import spacy from collections import defaultdict from sklearn.metrics import log_loss import matplotlib.pyplot as plt %matplotlib inline import time import os print(os.listdir(""../input"")) # Any results you write to the current directory are saved as output.",No,5,88.0 nlp = spacy.load('en_core_web_lg'),No,5,30.0 "spacy_tok = SpacyTokenizer(""en"") tokenizer = Tokenizer(spacy_tok)",No,5,84.0 "df_pretrain.Text.apply(lambda x: len(tokenizer.process_text(x, spacy_tok))).describe()",No,5,40.0 "class Graph(): def __init__(self): """""" self.edges is a dict of all possible next nodes e.g. {'X': ['A', 'B', 'C', 'E'], ...} self.weights has all the weights between two nodes, with the two nodes as a tuple as the key e.g. 
{('X', 'A'): 7, ('X', 'B'): 2, ...} """""" self.edges = defaultdict(list) self.weights = {} def add_edge(self, from_node, to_node, weight, back_penalty=1): # Note: assumes edges are bi-directional self.edges[from_node].append(to_node) self.edges[to_node].append(from_node) self.weights[(from_node, to_node)] = weight self.weights[(to_node, from_node)] = weight*back_penalty def dijsktra(graph, initial, end): # shortest paths is a dict of nodes # whose value is a tuple of (previous node, weight) shortest_paths = {initial: (None, 0)} current_node = initial visited = set() while current_node != end: visited.add(current_node) destinations = graph.edges[current_node] weight_to_current_node = shortest_paths[current_node][1] for next_node in destinations: weight = graph.weights[(current_node, next_node)] + weight_to_current_node if next_node not in shortest_paths: shortest_paths[next_node] = (current_node, weight) else: current_shortest_weight = shortest_paths[next_node][1] if current_shortest_weight > weight: shortest_paths[next_node] = (current_node, weight) next_destinations = {node: shortest_paths[node] for node in shortest_paths if node not in visited} if not next_destinations: raise Exception(""Something is wrong"") # next node is the destination with the lowest weight current_node = min(next_destinations, key=lambda k: next_destinations[k][1]) # Work back through destinations in shortest path path = [] dist = 0 while current_node is not None: path.append(current_node) next_node = shortest_paths[current_node][0] dist += shortest_paths[current_node][1] current_node = next_node # Reverse path path = path[::-1] return path, dist '",No,5,53.0 "def get_rank(token): """"""Step up with token.head until it reaches the root. Returns with step number and root"""""" i = 0 next_token = token while(next_token!=next_token.head): i+=1 next_token=next_token.head return i, next_token def child_count(token): cc = 0 for child in token.children: cc+=1 return cc",No,5,53.0 "def build_answers(data): answers = [] for i in range(len(data)): dataNext = data.loc[i] Acoref = dataNext[""A-coref""] Bcoref = dataNext[""B-coref""] answerNext = [int(Acoref), int(Bcoref), 1-int(Acoref or Bcoref)] answers.append(answerNext) return np.vstack(answers)",No,5,53.0 "def build_features(data): """"""Generates features from input data"""""" features = [] sum_good = 0 for i in range(0,len(data)): fi = [] dataNext = data.loc[i] text = dataNext[""Text""] #print(visualise(dataNext)) doc=nlp(text) Aoff = dataNext[""A-offset""] Boff = dataNext[""B-offset""] Poff = dataNext[""Pronoun-offset""] lth = len(text) for token in doc: if(token.idx==Aoff): Atoken = token if(token.idx==Boff): Btoken = token if(token.idx==Poff): Ptoken=token Arank, Aroot = get_rank(Atoken) Brank, Broot = get_rank(Btoken) Prank, Proot = get_rank(Ptoken) graph = Graph() for token in doc: graph.add_edge(token, token.head, 1, 4) sent_root = [] for sent in doc.sents: sent_root.append(sent.root) for j in range(len(sent_root)-1): graph.add_edge(sent_root[j], sent_root[j+1],1, 4) try: _, Alen = dijsktra(graph, Atoken, Ptoken) except: Alen = 300 try: _, Blen = dijsktra(graph, Btoken, Ptoken) except: Blen = 300 sent_num = len(sent_root) for i in range(len(sent_root)): if Aroot == sent_root[i]: Atop = i if Broot == sent_root[i]: Btop = i if Proot == sent_root[i]: Ptop = i fi.append(Aoff/lth)#0 fi.append(Boff/lth)#1 fi.append(Poff/lth)#2 fi.append(1.0*Atop/sent_num)#3 fi.append(1.0*Btop/sent_num)#4 fi.append(1.0*Ptop/sent_num)#5 fi.append(Arank/10)#6 fi.append(Brank/10)#7 
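# Features appended so far: 0-2 are the character offsets of A, B and the pronoun
# normalised by text length; 3-5 are the indices of the sentences containing each
# mention, normalised by the sentence count; 6-8 are the dependency-tree depths
# returned by get_rank, scaled down by 10 (the pronoun's scaled rank comes next).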
fi.append(Prank/10)#8 #fi.append(Atoken.similarity(Ptoken))#9 #fi.append(Btoken.similarity(Ptoken))#10 #fi.append(Alen/300)#9 #fi.append(Blen/300)#10 #fi.append(child_count(Aroot))#11 #fi.append(child_count(Broot))#12 #fi.append(child_count(Proot))#13 features.append(fi) return np.vstack(features) def swap_raws(data, i, j): """"""Swap the ith and jth column of the data"""""" new_data = np.copy(data) temp = np.copy(new_data[:, i]) new_data[:,i] = new_data[:,j] new_data[:,j] = temp return new_data",No,5,53.0 "!pip install pytorch-pretrained-bert !pip install https://github.com/ceshine/pytorch_helper_bot/archive/0.0.4.zip",No,3,22.0 "import os # This variable is used by helperbot to make the training deterministic os.environ[""SEED""] = ""420"" import logging from pathlib import Path import torch import torch.nn as nn import numpy as np import pandas as pd from torch.utils.data import Dataset, DataLoader from pytorch_pretrained_bert import BertTokenizer from pytorch_pretrained_bert.modeling import BertModel from helperbot import BaseBot, TriangularLR, WeightDecayOptimizerWrapper ",No,5,23.0 "BERT_MODEL = 'bert-large-uncased' CASED = False",No,5,77.0 "df_train = pd.read_csv(""gap-test.tsv"", delimiter=""\\t"") df_val = pd.read_csv(""gap-validation.tsv"", delimiter=""\\t"") df_test = pd.read_csv(""gap-development.tsv"", delimiter=""\\t"") sample_sub = pd.read_csv(""../input/sample_submission_stage_1.csv"") assert sample_sub.shape[0] == df_test.shape[0]'",No,5,45.0 "tokenizer = BertTokenizer.from_pretrained( BERT_MODEL, do_lower_case=CASED, never_split = (""[UNK]"", ""[SEP]"", ""[PAD]"", ""[CLS]"", ""[MASK]"", ""[A]"", ""[B]"", ""[P]"") ) # These tokens are not actually used, so we can assign arbitrary values. tokenizer.vocab[""[A]""] = -1 tokenizer.vocab[""[B]""] = -1 tokenizer.vocab[""[P]""] = -1",No,4,30.0 "sample = df.sample(random_state=100) display_entry(sample.iloc[0])",No,4,41.0 "import os # This variable is used by helperbot to make the training deterministic os.environ[""SEED""] = ""323"" import logging from pathlib import Path import torch import torch.nn as nn import numpy as np import pandas as pd from tqdm import tqdm_notebook from sklearn.metrics import log_loss from sklearn.model_selection import StratifiedKFold from torch.optim.lr_scheduler import CosineAnnealingLR from torch.utils.data import Dataset, DataLoader, TensorDataset from pytorch_pretrained_bert import BertTokenizer from pytorch_pretrained_bert.modeling import BertModel from allennlp.modules.span_extractors import SelfAttentiveSpanExtractor, EndpointSpanExtractor from helperbot import ( TriangularLR, BaseBot, WeightDecayOptimizerWrapper )",No,5,23.0 "result = pd.DataFrame({'ID': df.index, 'A': 1, 'B': 1, 'NEITHER': 1}) result.to_csv('dummy_all_equal.csv', index=False)",No,4,25.0 "result['A'] = 1 result['B'] = 0 result['NEITHER'] = 0 result.to_csv('dummy_A.csv', index=False)",No,4,25.0 "result['A'] = 0 result['B'] = 1 result['NEITHER'] = 0 result.to_csv('dummy_B.csv', index=False)",No,5,25.0 "result['A'] = 0 result['B'] = 0 result['NEITHER'] = 1 result.to_csv('dummy_NEITHER.csv', index=False)",No,5,25.0 "# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.log_loss.html from sklearn.metrics import log_loss y_true = [""spam"", ""ham"", ""ham"", ""spam""] # The labels in y_pred are assumed to be ordered alphabetically, as done by preprocessing.LabelBinarizer # [""ham"", ""spam""] y_pred = [ [.1, .9], [.9, .1], [.8, .2], [.35, .65] ] log_loss(y_true, y_pred)",No,4,49.0 "# This Python 3 environment comes 
with many helpful analytics libraries installed # It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load in import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) # Input data files are available in the ""../input/"" directory. # For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory import os import re print(os.listdir(""../input"")) import spacy import networkx as nx import zipfile sample_submission = pd.read_csv(""../input/gendered-pronoun-resolution/sample_submission_stage_1.csv"") final_test = pd.read_csv(""../input/gendered-pronoun-resolution/test_stage_2.tsv"", sep = ""\\t"") nlp = spacy.load('en_core_web_sm') dep = [""ACL"", ""ACOMP"", ""ADVCL"", ""ADVMOD"", ""AGENT"", ""AMOD"", ""APPOS"", ""ATTR"", ""AUX"", ""AUXPASS"", ""CASE"", ""CC"", ""CCOMP"", ""COMPOUND"", ""CONJ"", ""CSUBJ"", ""CSUBJPASS"", ""DATIVE"", ""DEP"", ""DET"", ""DOBJ"" , ""EXPL"", ""INTJ"", ""MARK"", ""META"", ""NEG"", ""NOUNMOD"", ""NPMOD"", ""NSUBJ"", ""NSUBJPASS"", ""NUMMOD"" , ""OPRD"", ""PARATAXIS"", ""PCOMP"", ""POBJ"", ""POSS"", ""PRECONJ"", ""PREDET"", ""PREP"", ""PRT"", ""PUNCT"", ""QUANTMOD"", ""RELCL"", ""ROOT"", ""XCOMP"", ""COMPLM"",""INFMOD"",""PARTMOD"",""HMOD"",""HYPH"",""IOBJ"",""NUM"", ""NUMBER"",""NMOD"",""NN"",""NPADVMOD"",""POSSESSIVE"",""RCMOD"",""SUBTOK""] # Any results you write to the current directory are saved as output.",No,4,22.0 final_test.shape,No,5,58.0 import tensorflow as tf,No,5,22.0 "train_data = pd.read_csv(""gap-development.tsv"", sep = ""\\t"") validation_data = pd.read_csv(""gap-validation.tsv"", sep = ""\\t"") test_data = pd.read_csv(""gap-test.tsv"", sep = ""\\t"")'",No,5,45.0 " merge_data = pd.concat([train_data,validation_data]).reset_index(drop = True) merge_data = pd.concat([merge_data,train_data]).reset_index(drop = True) count = 0",No,5,11.0 " def name_replace(s, r1, r2): s = str(s).replace(r1,r2) for r3 in r1.split(' '): s = str(s).replace(r3,r2) return s def shortest_dependency_path(doc, e1=None, e2=None): edges = [] for token in doc: for child in token.children: edges.append(('{0}'.format(token), '{0}'.format(child))) graph = nx.Graph(edges) try: shortest_path = nx.shortest_path(graph, source=e1, target=e2) except Exception as e: shortest_path = [e1, e2] print(e) print(doc, e1, e2) return shortest_path def dependency_vector(doc, pronoun, word): vector = [0] * 59 # for token in doc: # if token.text == pronoun: # pi = token.i # elif token.text == word: # wi = token.i # if pi>wi: # for token in doc[wi:pi+1]: # index = dep.index(token.dep_.upper()) # vector[index] = 1 # else: # for token in doc[pi:wi+1]: # index = dep.index(token.dep_.upper()) # vector[index] = 1 # return vector x = shortest_dependency_path(doc, pronoun, word) for token in doc: if token.text in x: val = (x.index(str(token)) + 1) / len(x) try: index = dep.index(token.dep_.upper()) vector[index] = val except: pass return vector def get_features(df): df['A-offset2'] = df['A-offset'] + df['A'].map(len) df['B-offset2'] = df['B-offset'] + df['B'].map(len) df[""Text""] = df.apply(lambda row: name_replace(row[""Text""], row[""A""], ""Noun_1""), axis = 1) df[""Text""] = df.apply(lambda row: name_replace(row[""Text""], row[""B""], ""Noun_2""), axis = 1) new_df = pd.DataFrame([]) new_df[""Pronoun-offset""] = df[""Pronoun-offset""] new_df['A-offset'] = df[""A-offset""] new_df[""B-offset""] = df[""B-offset""] 
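# Besides the raw offsets, keep the end offsets of A and B (offset plus surface-form
# length) and the absolute character distance from the pronoun to each candidate,
# then append the per-row dependency-path vectors computed below.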
new_df['A-offset2'] = df['A-offset2'] new_df['B-offset2'] = df['B-offset2'] new_df['A_dist'] = (df['Pronoun-offset'] - df['A-offset']).abs() new_df['B_dist'] = (df['Pronoun-offset'] - df['B-offset']).abs() df[""Text""] = df.Text.apply(lambda row: "" and "".join(row.split("". ""))) vectors_A = df.apply(lambda row: dependency_vector(nlp(row[""Text""]), row[""Pronoun""],""Noun_1"") + dependency_vector(nlp(row[""Text""]), row[""Pronoun""],""Noun_2""), axis = 1) print(count) new_df_2 = pd.DataFrame(vectors_A.tolist()) new_df = pd.concat([new_df, new_df_2], axis = 1) return new_df '",Yes,2,53.0 "feature = get_features(merge_data) ",No,5,8.0 feature,No,5,53.0 " Y = merge_data[[""A-coref"", ""B-coref""]] Y.columns = [""A"",""B""] Y[""A""] = Y[""A""].astype(int) Y[""B""] = Y[""B""].astype(int) Y[""NEITHER""] = 1- (Y[""A""] + Y[""B""])",No,4,16.0 "from sklearn import * from sklearn.ensemble import RandomForestClassifier from sklearn.tree import DecisionTreeClassifier import xgboost as xgb from xgboost import XGBClassifier from sklearn.preprocessing import StandardScaler x1, x2, y1, y2 = model_selection.train_test_split(feature.fillna(-1), Y, test_size=0.2, random_state=1) x1.head() x2.head() y2",No,4,22.0 "scaler = StandardScaler() x1 = scaler.fit_transform(x1) x2 = scaler.transform(x2) model = multiclass.OneVsRestClassifier(ensemble.RandomForestClassifier(max_depth = 7, n_estimators=1000, random_state=33)) # model = multiclass.OneVsRestClassifier(ensemble.ExtraTreesClassifier(n_jobs=-1, n_estimators=100, random_state=33)) # param_dist = {'objective': 'binary:logistic', 'max_depth': 1, 'n_estimators':1000, 'num_round':1000, 'eval_metric': 'logloss'} # model = multiclass.OneVsRestClassifier(xgb.XGBClassifier(**param_dist)) model.fit(x1, y1) print('log_loss', metrics.log_loss(y2, model.predict_proba(x2)))",No,3,18.0 "final_test = pd.read_csv(""../input/gendered-pronoun-resolution/test_stage_2.tsv"", sep = ""\\t"") feature = get_features(final_test) print(feature) '",No,3,45.0 " feature = feature.fillna(-1) # feature = scaler.transform(feature) print(feature) ",No,4,17.0 " Y = pd.DataFrame(model.predict_proba(feature).tolist(), columns=[""A"",""B"", ""NEITHER""]) r = final_test[[""ID""]] submission = pd.concat([r,Y], axis = 1)",No,3,12.0 "print(submission) submission.to_csv('submission.csv', index=False)",No,4,25.0 !ls ../input,No,5,88.0 "import time import os import random import numpy as np import pandas as pd import torch from torch.optim import Adam from torch.utils.data import Dataset from torch.nn import Module, Linear, Dropout import torch.nn.functional as F from pytorch_pretrained_bert.modeling import BertModel, BertLayer from pytorch_pretrained_bert import BertTokenizer from pytorch_pretrained_bert.optimization import BertAdam from pytorch_pretrained_bert.optimization import warmup_linear from torch.utils.data import DataLoader from torch.utils.data import RandomSampler from sklearn.metrics import log_loss import matplotlib.pyplot as plt",No,5,22.0 "# Model bert_model = ""bert-large-cased"" n_bertlayers = 22 dropout = 0.1 # Preprocessing do_lower_case = False # Training train_batch_size = 4 gradient_accumulation_steps = 5 lr = 1e-5 num_train_epochs = 2 warmup_proportion = 0.1 optim = ""bertadam"" weight_decay = False # Others n_models = 10 eval_batch_size = 32 device = torch.device(""cuda"") data_dir = """"",No,4,59.0 "mf = '../input/fork-of-fork-of-densenet201/model.hdf5' import os os.system('ls '+mf)",No,5,88.0 "plt.figure(figsize=(16,8)) for i,country in enumerate(list_countries): 
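    # Daily new fatalities per country: difference the cumulative series (restricted to
    # days with a nonzero count), keep only positive increments and plot one curve per country.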
Fatal_diff=Fatal_pivot[(Fatal_pivot[country]>0)][country].diff().fillna(0) Fatal_diff=Fatal_diff[Fatal_diff>0] Fatal_diff.plot(color=colors[i],label=country,lw=5) plt.title('Number of daily new Fatalities',fontsize=15) plt.legend(title='country') plt.tight_layout()",No,5,33.0 "# all prediction is correct y_pred = [ [0., 1.], [1., 0.], [1., 0.], [0., 1.] ] log_loss(y_true, y_pred)",No,5,84.0 "# all prediction is wrong y_pred = [ [1., 0.], [0., 1.], [0., 1.], [1., 0.] ] log_loss(y_true, y_pred)",No,5,49.0 "samples = df.sample(n=10, random_state=100) for _, s in samples.iterrows(): display_entry(s)",No,4,41.0 "import spacy nlp = spacy.load('en_core_web_lg') displacy.render(nlp(samples.iloc[-2]['Text']), style='dep', jupyter=True, options={'distance': 150})",No,4,22.0 "# following: https://www.kaggle.com/keyit92/end2end-coref-resolution-by-attention-rnn/data train_df = read_df(""../input/googlegapcoreference/gap-test.tsv"") test_df = read_df(""../input/googlegapcoreference/gap-development.tsv"") dev_df = read_df(""../input/googlegapcoreference/gap-validation.tsv"")",No,5,45.0 "print(f""Train: {train_df.shape}\ Test: {test_df.shape}\ Development: {dev_df.shape}"")'",No,5,58.0 "sample = train_df.sample(random_state=555) display_entry(sample.iloc[0])",No,5,41.0 "# just testing if there is any entry with more than one answer train_df[train_df[['A-coref', 'B-coref']].sum(axis=1) > 1]",No,4,41.0 "# adding a column with the answer def get_answer(row): if row['A-coref']: return 'A' if row['B-coref']: return 'B' return 'NEITHER' train_df['answer'] = train_df.apply(get_answer, axis=1)",No,5,8.0 train_df['Text-length'].hist(),No,5,33.0 "train_df.groupby(pd.qcut(train_df['Text-length'], q=[0, .25, .5, .75, 1.]))['answer'].value_counts().unstack()",No,5,60.0 "train_df_A = train_df[train_df['answer'] == 'A'] train_df_B = train_df[train_df['answer'] == 'B'] train_df_NEITHER = train_df[train_df['answer'] == 'NEITHER'] X_A_A = train_df_A.rename(columns={ 'A': 'RE', 'A-offset': 'RE-offset', 'A-offset-end': 'RE-offset-end' })[['Text', 'Pronoun', 'RE', 'RE-offset', 'RE-offset-end', 'URL', 'Text-length']] X_A_A['y'] = 1 X_A_A['referred-expression'] = 'A' X_A_B = train_df_A.rename(columns={ 'B': 'RE', 'B-offset': 'RE-offset', 'B-offset-end': 'RE-offset-end' })[['Text', 'Pronoun', 'RE', 'RE-offset', 'RE-offset-end', 'URL', 'Text-length', 'answer']] X_A_B['y'] = 0 X_A_B['referred-expression'] = 'A' X_B_B = train_df_B.rename(columns={ 'B': 'RE', 'B-offset': 'RE-offset', 'B-offset-end': 'RE-offset-end' })[['Text', 'Pronoun', 'RE', 'RE-offset', 'RE-offset-end', 'URL', 'Text-length', 'answer']] X_B_B['y'] = 1 X_B_B['referred-expression'] = 'B' X_B_A = train_df_B.rename(columns={ 'A': 'RE', 'A-offset': 'RE-offset', 'A-offset-end': 'RE-offset-end' })[['Text', 'Pronoun', 'RE', 'RE-offset', 'RE-offset-end', 'URL', 'Text-length', 'answer']] X_B_A['y'] = 0 X_B_A['referred-expression'] = 'B' X_NEITHER_A = train_df_NEITHER.rename(columns={ 'A': 'RE', 'A-offset': 'RE-offset', 'A-offset-end': 'RE-offset-end' })[['Text', 'Pronoun', 'RE', 'RE-offset', 'RE-offset-end', 'URL', 'Text-length', 'answer']] X_NEITHER_A['y'] = 0 X_NEITHER_A['referred-expression'] = 'A' X_NEITHER_B = train_df_NEITHER.rename(columns={ 'B': 'RE', 'B-offset': 'RE-offset', 'B-offset-end': 'RE-offset-end' })[['Text', 'Pronoun', 'RE', 'RE-offset', 'RE-offset-end', 'URL', 'Text-length', 'answer']] X_NEITHER_B['y'] = 0 X_NEITHER_B['referred-expression'] = 'B' X_df = pd.concat((X_A_A, X_A_B, X_B_A, X_B_B, X_NEITHER_A, X_NEITHER_B))",Yes,3,61.0 X_df.shape,No,5,58.0 
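# Added illustration, not part of the original notebook: a quick sanity check on the
# pairwise frame built above. Every GAP row contributes two candidate/pronoun pairs and
# at most one of them is positive (none for NEITHER rows), so the positive rate of y
# should sit somewhat below one half.
print(X_df['y'].value_counts(normalize=True))
print(X_df.groupby('referred-expression')['y'].mean())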
X_df.sample(random_state=1),No,5,41.0 "from textacy import similarity def add_features(df, re_col, inplace=False): if inplace: df_ = df else: df_ = df.copy() df_['URL_last_part'] = df_['URL'].str.rsplit('/', n=1, expand=True)[1].apply(preprocess_so) df_['URL_distance_jaro_winkler'] = df_.apply(lambda row: similarity.jaro_winkler(row['URL_last_part'], row[re_col]), axis=1) df_['URL_distance_levenshtein'] = df_.apply(lambda row: similarity.levenshtein(row['URL_last_part'], row[re_col]), axis=1) df_['URL_distance_token_sort_ratio'] = df_.apply(lambda row: similarity.token_sort_ratio(row['URL_last_part'], row[re_col]), axis=1) return df_ add_features(X_df, 'RE', inplace=True) X_df.sample(5, random_state=800)[['URL_last_part', 'URL']]",No,4,8.0 "X_df.hist(column='URL_distance_jaro_winkler', by='y', figsize=(20, 5), bins=20, sharey=True)",No,5,33.0 "X_df.hist(column='URL_distance_levenshtein', by='y', figsize=(20, 5), bins=20, sharey=True)",No,5,33.0 "X_df.hist(column='URL_distance_token_sort_ratio', by='y', figsize=(20, 5), bins=20, sharey=True)",No,5,33.0 "from sklearn.linear_model import LinearRegression from sklearn.model_selection import train_test_split X = X_df[['URL_distance_token_sort_ratio', 'URL_distance_levenshtein', 'URL_distance_jaro_winkler', 'Text-length', 'referred-expression']] y = X_df['y'] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=23) X_train_features = X_train.drop(columns='referred-expression') X_train_referred_expression = X_train['referred-expression'] X_test_features = X_test.drop(columns='referred-expression') X_test_referred_expression = X_test['referred-expression'] lr = LinearRegression(normalize=True).fit(X_train_features, y_train) y_pred_ = lr.predict(X_test_features)",Yes,4,21.0 "import numpy as np def transform_to_submit(y_pred_, referred_expression): y_pred_comp = 1 - y_pred_ all_zero = np.zeros_like(y_pred_).reshape((-1, 1)) y_pred = np.hstack(( np.where(referred_expression == 'A', y_pred_, y_pred_comp).reshape((-1, 1)), np.where(referred_expression == 'B', y_pred_comp, y_pred_).reshape((-1, 1)), all_zero )) return y_pred y_true = np.hstack(( np.where(((X_test_referred_expression == 'A') & (y_test)), 1, 0).reshape((-1, 1)), np.where(((X_test_referred_expression == 'B') & (y_test)), 1, 0).reshape((-1, 1)), np.zeros_like(y_test).reshape((-1, 1)) )) # TODO: refact # one of the ideas is to run the model over all the referred expressions and then calculate the final answer #y_pred_A = lr.predict(df_A).reshape(-1, 1) #y_pred_B = lr.predict(df_B).reshape(-1, 1) #all_zero = np.zeros_like(y_pred_A) #X_test_A = add_features(X_test, 'A', inplace=False)[['URL_distance_token_sort_ratio', 'URL_distance_levenshtein', 'URL_distance_jaro_winkler', 'Text-length']] #X_test_B = add_features(X_test, 'B', inplace=False)[['URL_distance_token_sort_ratio', 'URL_distance_levenshtein', 'URL_distance_jaro_winkler', 'Text-length']] #y_pred = np.hstack((y_pred_A, # y_pred_B, # all_zero # )) #y_true[(np.abs(y_true[:, 0] - y_true[:, 1]) < 0.1) & (y_true[:, 0] < .3), 2] = .5 log_loss(y_true, transform_to_submit(y_pred_, X_test_referred_expression))",No,4,55.0 y_true,No,5,41.0 "X_features = X.drop(columns='referred-expression') X_referred_expression = X['referred-expression'] lr.fit(X_features, y)",Yes,3,7.0 "df_A = add_features(df, 'A', inplace=False)[['URL_distance_token_sort_ratio', 'URL_distance_levenshtein', 'URL_distance_jaro_winkler', 'Text-length']] df_B = add_features(df, 'B', inplace=False)[['URL_distance_token_sort_ratio', 
'URL_distance_levenshtein', 'URL_distance_jaro_winkler', 'Text-length']] y_pred_A = lr.predict(df_A).reshape(-1, 1) y_pred_B = lr.predict(df_B).reshape(-1, 1) all_zero = np.zeros_like(y_pred_A) y_pred = np.hstack((y_pred_A, y_pred_B, all_zero )) result = pd.DataFrame(y_pred, index=df.index, columns=['A', 'B', 'NEITHER']) result.loc[((result['A'] - result['B']).abs() < 0.1) & (result['A'] < .3), 'NEITHER'] = .3 result.to_csv('lr_over_URL_similarity.csv')",Yes,3,27.0 "import numpy as np import pandas as pd import os for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename))",No,5,88.0 "os.environ[""SEED""] = ""420"" import torch import torch.nn as nn from torch.utils.data import Dataset, DataLoader from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup, BertConfig import re from tqdm import tqdm",No,5,23.0 "df_train = pd.read_csv(""/kaggle/input/gapvalidation/gap-test.tsv"", delimiter=""\\t"") df_val = pd.read_csv(""/kaggle/input/gapvalidation/gap-validation.tsv"", delimiter=""\\t"") df_test = pd.read_csv(""/kaggle/input/gapvalidation/gap-development.tsv"", delimiter=""\\t"") test_2 = pd.read_csv(""/kaggle/input/gendered-pronoun-resolution/test_stage_2.tsv"", delimiter=""\\t"") PRETRAINED_MODEL_NAME = 'bert-large-uncased' bert_path = ""../input/bert-base-uncased/"" tokenizer = BertTokenizer.from_pretrained(bert_path) pad_len = 300'",No,4,45.0 "tokenizer.add_tokens(['[A]', '[B]', '[P]']) def insert_tag(row): to_be_inserted = sorted([ (row[""A-offset""], "" [A] ""), (row[""B-offset""], "" [B] ""), (row[""Pronoun-offset""], "" [P] "") ], key=lambda x: x[0], reverse=True) text = row[""Text""] for offset, tag in to_be_inserted: text = text[:offset] + tag + text[offset:] return text def tokenize(text, tokenizer): entries = {} final_tokens = [] for token in tokenizer.tokenize(text): if token in (""[A]"", ""[B]"", ""[P]""): entries[token] = len(final_tokens) continue final_tokens.append(token) return final_tokens, (entries[""[A]""], entries[""[B]""], entries[""[P]""]) def target(row): if int(row['A-coref']) == 1: return 0 elif int(row['B-coref']) == 1: return 1 else: return 2 """""" The lower part was taken from [PyTorch] BERT + EndpointSpanExtractor + KFold """""" def children(m): return m if isinstance(m, (list, tuple)) else list(m.children()) def set_trainable_attr(m, b): m.trainable = b for p in m.parameters(): p.requires_grad = b def apply_leaf(m, f): c = children(m) if isinstance(m, nn.Module): f(m) if len(c) > 0: for l in c: apply_leaf(l, f) def set_trainable(l, b): apply_leaf(l, lambda m: set_trainable_attr(m, b))'",Yes,3,8.0 "class modified_dataset(torch.utils.data.Dataset): def __init__(self, df, tokenizer): p_text = [] offsets = [] at_mask = [] self.y_lst = df[['A-coref', 'B-coref']].apply(lambda row: target(row), axis = 1) for row in tqdm(range(len(df))): tokens, offset = tokenize(insert_tag(df.iloc[row]), tokenizer) bla = tokenizer.encode_plus(tokens, max_length = pad_len, pad_to_max_length = True, return_token_type_ids = False) p_text.append(bla['input_ids']) at_mask.append(bla['attention_mask']) offsets.append(offset) self.p_text = torch.tensor(p_text) self.offsets = torch.tensor(offsets) self.at_mask = torch.tensor(at_mask) return def __len__(self): return len(self.p_text) def __getitem__(self,item): return self.p_text[item], self.y_lst[item], self.offsets[item], self.at_mask[item] class modified_dataset_test(torch.utils.data.Dataset): def __init__(self, df, tokenizer): p_text = [] 
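# Same preprocessing as modified_dataset above, but for the unlabeled stage-2 test set:
# only the encoded token ids, the [A]/[B]/[P] tag offsets and the attention masks are
# stored, since no coreference labels are available.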
offsets = [] at_mask = [] for row in tqdm(range(len(df))): tokens, offset = tokenize(insert_tag(df.iloc[row]), tokenizer) bla = tokenizer.encode_plus(tokens, max_length = pad_len, pad_to_max_length = True, return_token_type_ids = False) p_text.append(bla['input_ids']) at_mask.append(bla['attention_mask']) offsets.append(offset) self.p_text = torch.tensor(p_text) self.offsets = torch.tensor(offsets) self.at_mask = torch.tensor(at_mask) return def __len__(self): return len(self.p_text) def __getitem__(self,item): return self.p_text[item], self.offsets[item], self.at_mask[item] def collate_fun(batch): tmp_lst = list(zip(*batch)) return torch.stack(tmp_lst[0], axis = 0), torch.tensor(tmp_lst[1]), torch.stack(tmp_lst[2], axis = 0), torch.stack(tmp_lst[3], axis = 0) def collate_fun2(batch): tmp_lst = list(zip(*batch)) return torch.stack(tmp_lst[0], axis = 0), torch.stack(tmp_lst[1], axis = 0), torch.stack(tmp_lst[2], axis = 0) train_loader = DataLoader( modified_dataset(df_train, tokenizer), batch_size=18, collate_fn=collate_fun, shuffle=True, drop_last=True, num_workers=2) val_loader = DataLoader( modified_dataset(df_val, tokenizer), batch_size=30, collate_fn=collate_fun, shuffle=False, num_workers=2) test_loader = DataLoader( modified_dataset(df_test, tokenizer), batch_size=30, collate_fn=collate_fun, shuffle=False, num_workers=2) test_2_loader = DataLoader( modified_dataset_test(test_2, tokenizer), batch_size=30, collate_fn=collate_fun2, shuffle=False, num_workers=2)",Yes,3,44.0 "subs = df_weo['Subject Descriptor'].unique()[:-1] df_weo_agg = df_weo[['Country']][df_weo['Country'].duplicated()==False].reset_index(drop=True) for sub in subs[:]: df_tmp = df_weo[['Country', '2019']][df_weo['Subject Descriptor']==sub].reset_index(drop=True) df_tmp = df_tmp[df_tmp['Country'].duplicated()==False].reset_index(drop=True) df_tmp.columns = ['Country', sub] df_weo_agg = df_weo_agg.merge(df_tmp, on='Country', how='left') df_weo_agg.columns = ["""".join (c if c.isalnum() else ""_"" for c in str(x)) for x in df_weo_agg.columns] df_weo_agg.columns df_weo_agg['Country_Region'] = df_weo_agg['Country'] df_weo_agg.head()'",No,3,14.0 "# merge df_traintest5 = pd.merge(df_traintest4, df_weo_agg, on='Country_Region', how='left') print(df_traintest5.shape) df_traintest5.head()",No,3,32.0 "# add Life expectancy # Life expectancy at birth obtained from http://hdr.undp.org/en/data df_life = pd.read_csv(""../input/smokingstats/Life expectancy at birth.csv"") tmp = df_life.iloc[:,1].values.tolist() df_life = df_life[['Country', '2018']] def func(x): x_new = 0 try: x_new = float(x.replace("","", """")) except: # print(x) x_new = np.nan return x_new df_life['2018'] = df_life['2018'].apply(lambda x: func(x)) df_life.head()'",Yes,2,45.0 "df_life = df_life[['Country', '2018']] df_life.columns = ['Country_Region', 'LifeExpectancy']",No,3,14.0 "# merge df_traintest6 = pd.merge(df_traintest5, df_life, on='Country_Region', how='left') print(len(df_traintest6)) df_traintest6.head()",No,4,32.0 "# add additional info from countryinfo dataset df_country = pd.read_csv(""../input/countryinfo/covid19countryinfo.csv"") df_country.head()",No,4,45.0 "df_country['Country_Region'] = df_country['country'] df_country = df_country[df_country['country'].duplicated()==False]",No,3,19.0 print(df_country[df_country['country'].duplicated()].shape),No,5,38.0 df_country[df_country['country'].duplicated()],No,2,38.0 "df_traintest7 = pd.merge(df_traintest6, df_country.drop(['tests', 'testpop', 'country'], axis=1), on=['Country_Region',], how='left') 
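# Left-join the country-level metadata (demographics, health statistics, reported case
# counts) onto the train/test frame; places whose country is absent from
# covid19countryinfo.csv simply receive NaN in these columns.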
print(df_traintest7.shape) df_traintest7.head()",No,3,32.0 "def encode_label(df, col, freq_limit=0): df[col][pd.isna(df[col])] = 'nan' tmp = df[col].value_counts() cols = tmp.index.values freq = tmp.values num_cols = (freq>=freq_limit).sum() print(""col: {}, num_cat: {}, num_reduced: {}"".format(col, len(cols), num_cols)) col_new = '{}_le'.format(col) df_new = pd.DataFrame(np.ones(len(df), np.int16)*(num_cols-1), columns=[col_new]) for i, item in enumerate(cols[:num_cols]): df_new[col_new][df[col]==item] = i return df_new def get_df_le(df, col_index, col_cat): df_new = df[[col_index]] for col in col_cat: df_tmp = encode_label(df, col) df_new = pd.concat([df_new, df_tmp], axis=1) return df_new df_traintest7['id'] = np.arange(len(df_traintest7)) df_le = get_df_le(df_traintest7, 'id', ['Country_Region', 'Province_State']) df_traintest8 = pd.merge(df_traintest7, df_le, on='id', how='left')'",Yes,3,8.0 "df_traintest8['cases/day'] = df_traintest8['cases/day'].astype(np.float) df_traintest8['fatal/day'] = df_traintest8['fatal/day'].astype(np.float)",No,5,16.0 "# covert object type to float def func(x): x_new = 0 try: x_new = float(x.replace("","", """")) except: # print(x) x_new = np.nan return x_new cols = [ 'Gross_domestic_product__constant_prices', 'Gross_domestic_product__current_prices', 'Gross_domestic_product__deflator', 'Gross_domestic_product_per_capita__constant_prices', 'Gross_domestic_product_per_capita__current_prices', 'Output_gap_in_percent_of_potential_GDP', 'Gross_domestic_product_based_on_purchasing_power_parity__PPP__valuation_of_country_GDP', 'Gross_domestic_product_based_on_purchasing_power_parity__PPP__per_capita_GDP', 'Gross_domestic_product_based_on_purchasing_power_parity__PPP__share_of_world_total', 'Implied_PPP_conversion_rate', 'Total_investment', 'Gross_national_savings', 'Inflation__average_consumer_prices', 'Inflation__end_of_period_consumer_prices', 'Six_month_London_interbank_offered_rate__LIBOR_', 'Volume_of_imports_of_goods_and_services', 'Volume_of_Imports_of_goods', 'Volume_of_exports_of_goods_and_services', 'Volume_of_exports_of_goods', 'Unemployment_rate', 'Employment', 'Population', 'General_government_revenue', 'General_government_total_expenditure', 'General_government_net_lending_borrowing', 'General_government_structural_balance', 'General_government_primary_net_lending_borrowing', 'General_government_net_debt', 'General_government_gross_debt', 'Gross_domestic_product_corresponding_to_fiscal_year__current_prices', 'Current_account_balance', 'pop', 'sex0', 'sex14', 'sex25', 'sex54', 'sex64', 'sex65plus', 'sexratio', 'lung', 'femalelung', 'malelung', 'gdp2019', 'healthexp', 'healthperpop', 'fertility', 'firstcase', 'totalcases', 'activecases', 'newcases', 'deaths', 'newdeaths', 'recovered', 'critical', 'casediv1m', 'deathdiv1m', ] for col in cols: df_traintest8[col] = df_traintest8[col].apply(lambda x: func(x)) print(df_traintest8['pop'].dtype)'",No,3,16.0 df_traintest8[df_traintest8['place_id']=='China/Hubei'].head(),No,5,41.0 "for col in df_traintest8.columns: print(""'{}',"".format(col))'",No,5,71.0 "day_before_valid = 71 # 3-11 day before of validation day_before_public = 78 # 3-18 last day of train day_before_launch = 85 # 4-1 last day before launch",No,2,23.0 "def calc_score(y_true, y_pred): y_true[y_true<0] = 0 score = metrics.mean_squared_error(np.log(y_true.clip(0, 1e10)+1), np.log(y_pred[:]+1))**0.5 return score",No,5,84.0 "# train model to predict fatalities/day # params SEED = 42 params = {'num_leaves': 8, 'min_data_in_leaf': 5, # 42, 
'objective': 'regression', 'max_depth': 8, 'learning_rate': 0.02, 'boosting': 'gbdt', 'bagging_freq': 5, # 5 'bagging_fraction': 0.8, # 0.5, 'feature_fraction': 0.8201, 'bagging_seed': SEED, 'reg_alpha': 1, # 1.728910519108444, 'reg_lambda': 4.9847051755586085, 'random_state': SEED, 'metric': 'mse', 'verbosity': 100, 'min_gain_to_split': 0.02, # 0.01077313523861969, 'min_child_weight': 5, # 19.428902804238373, 'num_threads': 6, } ",No,5,59.0 "# train model to predict fatalities/day # features are selected manually based on valid score col_target = 'fatal/day' col_var = [ 'Lat', 'Long', 'days_since_1cases', # 'days_since_10cases', # 'days_since_100cases', # 'days_since_1fatal', # 'days_since_10fatal', 'days_since_100fatal', # 'cases/day_(1-1)', # 'cases/day_(1-7)', # 'cases/day_(8-14)', # 'cases/day_(15-21)', # 'fatal/day_(1-1)', # 'fatal/day_(1-7)', # 'fatal/day_(8-14)', # 'fatal/day_(15-21)', 'SmokingRate', 'Gross_domestic_product__constant_prices', 'Gross_domestic_product__current_prices', 'Gross_domestic_product__deflator', 'Gross_domestic_product_per_capita__constant_prices', 'Gross_domestic_product_per_capita__current_prices', 'Output_gap_in_percent_of_potential_GDP', 'Gross_domestic_product_based_on_purchasing_power_parity__PPP__valuation_of_country_GDP', 'Gross_domestic_product_based_on_purchasing_power_parity__PPP__per_capita_GDP', 'Gross_domestic_product_based_on_purchasing_power_parity__PPP__share_of_world_total', 'Implied_PPP_conversion_rate', 'Total_investment', 'Gross_national_savings', 'Inflation__average_consumer_prices', 'Inflation__end_of_period_consumer_prices', 'Six_month_London_interbank_offered_rate__LIBOR_', 'Volume_of_imports_of_goods_and_services', 'Volume_of_Imports_of_goods', 'Volume_of_exports_of_goods_and_services', 'Volume_of_exports_of_goods', 'Unemployment_rate', 'Employment', 'Population', 'General_government_revenue', 'General_government_total_expenditure', 'General_government_net_lending_borrowing', 'General_government_structural_balance', 'General_government_primary_net_lending_borrowing', 'General_government_net_debt', 'General_government_gross_debt', 'Gross_domestic_product_corresponding_to_fiscal_year__current_prices', 'Current_account_balance', 'LifeExpectancy', # 'pop', 'density', 'medianage', 'urbanpop', 'hospibed', # 'smokers', 'sex0', 'sex14', 'sex25', 'sex54', 'sex64', 'sex65plus', 'sexratio', 'lung', 'femalelung', 'malelung', 'gdp2019', 'healthexp', 'healthperpop', ] col_cat = [] df_train = df_traintest8[(pd.isna(df_traintest8['ForecastId'])) & (df_traintest8['day']<=day_before_valid)] df_valid = df_traintest8[(pd.isna(df_traintest8['ForecastId'])) & (day_before_validday_before_public-2].head()",No,5,14.0 "# remove overlap for private LB prediction df_tmp = df_traintest8[ ((df_traintest8['day']<=day_before_private) & (pd.isna(df_traintest8['ForecastId']))) | ((day_before_privateday_before_private-2].head()",No,4,14.0 "# predict test data in public # predict the cases and fatatilites one day at a time and use the predicts as next day's feature recursively. 
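# The two target columns are first masked (set to -1) on every test-period row; the loop
# below then fills them in one day at a time, so each new day's lag features are rebuilt
# from the predictions already made rather than from ground truth.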
df_preds = [] for i, place in enumerate(places[:]): df_interest = copy.deepcopy(df_traintest9[df_traintest9['place_id']==place].reset_index(drop=True)) df_interest['cases/day'][(pd.isna(df_interest['ForecastId']))==False] = -1 df_interest['fatal/day'][(pd.isna(df_interest['ForecastId']))==False] = -1 len_known = (df_interest['day']<=day_before_public).sum() len_unknown = (day_before_publicday_before_private] = df_preds_pri[df_preds['day']>day_before_private]",No,5,14.0 "df_preds.to_csv(""df_preds.csv"", index=None)",No,5,25.0 "# load sample submission df_sub = pd.read_csv(""../input/covid19-global-forecasting-week-2/submission.csv"") print(len(df_sub)) df_sub.head()",No,3,45.0 "# merge prediction with sub df_sub = pd.merge(df_sub, df_traintest3[['ForecastId', 'place_id', 'day']]) df_sub = pd.merge(df_sub, df_preds[['place_id', 'day', 'cases_pred', 'fatal_pred']], on=['place_id', 'day',], how='left') df_sub.head(10)",No,4,32.0 "# save df_sub['ConfirmedCases'] = df_sub['cases_pred'] df_sub['Fatalities'] = df_sub['fatal_pred'] df_sub = df_sub[['ForecastId', 'ConfirmedCases', 'Fatalities']] df_sub.to_csv(""submission.csv"", index=None) df_sub.head(10)'",No,3,25.0 %matplotlib inline,Yes,3,22.0 "# Imports # pandas import pandas as pd from pandas import Series,DataFrame # numpy, matplotlib, seaborn import numpy as np import matplotlib.pyplot as plt import seaborn as sns sns.set_style('whitegrid') %matplotlib inline # machine learning from sklearn.linear_model import LinearRegression from sklearn.linear_model import LogisticRegression from sklearn.svm import SVC, LinearSVC from sklearn.ensemble import RandomForestClassifier from sklearn.neighbors import KNeighborsClassifier from sklearn.naive_bayes import GaussianNB import xgboost as xgb",No,5,23.0 "# get rossmann, store, & test csv files as a DataFrame rossmann_df = pd.read_csv(""../input/train.csv"") store_df = pd.read_csv(""../input/store.csv"") test_df = pd.read_csv(""../input/test.csv"") # preview the data rossmann_df.head()",No,4,45.0 "rossmann_df.info() print(""----------------------------"") store_df.info() print(""----------------------------"") test_df.info()",No,5,40.0 "# Open fig, (axis1) = plt.subplots(1,1,figsize=(15,4)) sns.countplot(x='Open',hue='DayOfWeek', data=rossmann_df,palette=""husl"", ax=axis1) # fill NaN values in test_df with Open=1 if DayOfWeek != 7 test_df[""Open""][test_df[""Open""] != test_df[""Open""]] = (test_df[""DayOfWeek""] != 7).astype(int) # Drop Open column # rossmann_df.drop(""Open"", axis=1, inplace=True) # test_df.drop(""Open"", axis=1, inplace=True)'",Yes,2,17.0 "# Date # Create Year and Month columns rossmann_df['Year'] = rossmann_df['Date'].apply(lambda x: int(str(x)[:4])) rossmann_df['Month'] = rossmann_df['Date'].apply(lambda x: int(str(x)[5:7])) test_df['Year'] = test_df['Date'].apply(lambda x: int(str(x)[:4])) test_df['Month'] = test_df['Date'].apply(lambda x: int(str(x)[5:7])) # Assign Date column to Date(Year-Month) instead of (Year-Month-Day) # this column will be useful in analysis and visualization rossmann_df['Date'] = rossmann_df['Date'].apply(lambda x: (str(x)[:7])) test_df['Date'] = test_df['Date'].apply(lambda x: (str(x)[:7])) # group by date and get average sales, and precent change average_sales = rossmann_df.groupby('Date')[""Sales""].mean() pct_change_sales = rossmann_df.groupby('Date')[""Sales""].sum().pct_change() fig, (axis1,axis2) = plt.subplots(2,1,sharex=True,figsize=(15,8)) # plot average sales over time(year-month) ax1 = 
average_sales.plot(legend=True,ax=axis1,marker='o',title=""Average Sales"") ax1.set_xticks(range(len(average_sales))) ax1.set_xticklabels(average_sales.index.tolist(), rotation=90) # plot precent change for sales over time(year-month) ax2 = pct_change_sales.plot(legend=True,ax=axis2,marker='o',rot=90,colormap=""summer"",title=""Sales Percent Change"") # ax2.set_xticks(range(len(pct_change_sales))) # ax2.set_xticklabels(pct_change_sales.index.tolist(), rotation=90)'",Yes,2,8.0 "# .... contiune with Date # Plot average sales & customers for every year fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,4)) sns.barplot(x='Year', y='Sales', data=rossmann_df, ax=axis1) sns.barplot(x='Year', y='Customers', data=rossmann_df, ax=axis2) # Drop Date column # rossmann_df.drop(['Date'], axis=1,inplace=True) # test_df.drop(['Date'], axis=1,inplace=True)",No,5,75.0 "# Customers fig, (axis1,axis2) = plt.subplots(2,1,figsize=(15,8)) # Plot max, min values, & 2nd, 3rd quartile sns.boxplot([rossmann_df[""Customers""]], whis=np.inf, ax=axis1) # group by date and get average customers, and precent change average_customers = rossmann_df.groupby('Date')[""Customers""].mean() # pct_change_customers = rossmann_df.groupby('Date')[""Customers""].sum().pct_change() # Plot average customers over the time # it should be correlated with the average sales over time ax = average_customers.plot(legend=True,marker='o', ax=axis2) ax.set_xticks(range(len(average_customers))) xlabels = ax.set_xticklabels(average_customers.index.tolist(), rotation=90)'",No,3,33.0 "# DayOfWeek # In both cases where the store is closed and opened fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,4)) sns.barplot(x='DayOfWeek', y='Sales', data=rossmann_df, order=[1,2,3,4,5,6,7], ax=axis1) sns.barplot(x='DayOfWeek', y='Customers', data=rossmann_df, order=[1,2,3,4,5,6,7], ax=axis2)",No,5,75.0 "# Promo # Plot average sales & customers with/without promo fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,4)) sns.barplot(x='Promo', y='Sales', data=rossmann_df, ax=axis1) sns.barplot(x='Promo', y='Customers', data=rossmann_df, ax=axis2)",No,5,33.0 "# StateHoliday # StateHoliday column has values 0 & ""0"", So, we need to merge values with 0 to ""0"" rossmann_df[""StateHoliday""].loc[rossmann_df[""StateHoliday""] == 0] = ""0"" # test_df[""StateHoliday""].loc[test_df[""StateHoliday""] == 0] = ""0"" # Plot sns.countplot(x='StateHoliday', data=rossmann_df) # Before fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,4)) sns.barplot(x='StateHoliday', y='Sales', data=rossmann_df, ax=axis1) mask = (rossmann_df[""StateHoliday""] != ""0"") & (rossmann_df[""Sales""] > 0) sns.barplot(x='StateHoliday', y='Sales', data=rossmann_df[mask], ax=axis2)'",No,5,33.0 "# .... 
continue with StateHoliday # After rossmann_df[""StateHoliday""] = rossmann_df[""StateHoliday""].map({0: 0, ""0"": 0, ""a"": 1, ""b"": 1, ""c"": 1}) test_df[""StateHoliday""] = test_df[""StateHoliday""].map({0: 0, ""0"": 0, ""a"": 1, ""b"": 1, ""c"": 1}) fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,4)) sns.barplot(x='StateHoliday', y='Sales', data=rossmann_df, ax=axis1) sns.barplot(x='StateHoliday', y='Customers', data=rossmann_df, ax=axis2)'",No,5,33.0 "# SchoolHoliday # Plot sns.countplot(x='SchoolHoliday', data=rossmann_df) fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,4)) sns.barplot(x='SchoolHoliday', y='Sales', data=rossmann_df, ax=axis1) sns.barplot(x='SchoolHoliday', y='Customers', data=rossmann_df, ax=axis2)",No,5,33.0 "# Sales fig, (axis1,axis2) = plt.subplots(2,1,figsize=(15,8)) # Plot max, min values, & 2nd, 3rd quartile sns.boxplot([rossmann_df[""Customers""]], whis=np.inf, ax=axis1) # Plot sales values # Notice that values with 0 is mostly because the store was closed rossmann_df[""Sales""].plot(kind='hist',bins=70,xlim=(0,15000),ax=axis2)'",No,5,33.0 "# Using store_df # Merge store_df with average store sales & customers average_sales_customers = rossmann_df.groupby('Store')[[""Sales"", ""Customers""]].mean() sales_customers_df = DataFrame({'Store':average_sales_customers.index, 'Sales':average_sales_customers[""Sales""], 'Customers': average_sales_customers[""Customers""]}, columns=['Store', 'Sales', 'Customers']) store_df = pd.merge(sales_customers_df, store_df, on='Store') store_df.head()'",No,3,8.0 "# StoreType # Plot StoreType, & StoreType Vs average sales and customers sns.countplot(x='StoreType', data=store_df, order=['a','b','c', 'd']) fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,4)) sns.barplot(x='StoreType', y='Sales', data=store_df, order=['a','b','c', 'd'],ax=axis1) sns.barplot(x='StoreType', y='Customers', data=store_df, order=['a','b','c', 'd'], ax=axis2)",No,5,33.0 "# Assortment # Plot Assortment, & Assortment Vs average sales and customers sns.countplot(x='Assortment', data=store_df, order=['a','b','c']) fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,4)) sns.barplot(x='Assortment', y='Sales', data=store_df, order=['a','b','c'], ax=axis1) sns.barplot(x='Assortment', y='Customers', data=store_df, order=['a','b','c'], ax=axis2)",No,5,33.0 "# Promo2 # Plot Promo2, & Promo2 Vs average sales and customers sns.countplot(x='Promo2', data=store_df) fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,4)) sns.barplot(x='Promo2', y='Sales', data=store_df, ax=axis1) sns.barplot(x='Promo2', y='Customers', data=store_df, ax=axis2)",No,5,33.0 "# CompetitionDistance # fill NaN values store_df[""CompetitionDistance""].fillna(store_df[""CompetitionDistance""].median()) # Plot CompetitionDistance Vs Sales store_df.plot(kind='scatter',x='CompetitionDistance',y='Sales',figsize=(15,4)) store_df.plot(kind='kde',x='CompetitionDistance',y='Sales',figsize=(15,4))'",Yes,3,33.0 "# What happened to the average sales of a store over time when competition started? 
# Example: the average sales for store_id = 6 has dramatically decreased since the competition started store_id = 6 store_data = rossmann_df[rossmann_df[""Store""] == store_id] average_store_sales = store_data.groupby('Date')[""Sales""].mean() # Get year, and month when Competition started y = store_df[""CompetitionOpenSinceYear""].loc[store_df[""Store""] == store_id].values[0] m = store_df[""CompetitionOpenSinceMonth""].loc[store_df[""Store""] == store_id].values[0] # Plot ax = average_store_sales.plot(legend=True,figsize=(15,4),marker='o') ax.set_xticks(range(len(average_store_sales))) ax.set_xticklabels(average_store_sales.index.tolist(), rotation=90) # Since all data of store sales given in rossmann_df starts with year=2013 till 2015, # So, we need to check if year>=2013 and y & m aren't NaN values. if y >= 2013 and y == y and m == m: plt.axvline(x=((y-2013) * 12) + (m - 1), linewidth=3, color='grey')'",No,4,33.0 "# Risk Analysis # Analyze the risk of a store; Risk(std) Vs Expected(mean) # .... countiue using store_data store_average = store_data[""Sales""].mean() store_std = store_data[""Sales""].std() # Plot plt.scatter(store_average, store_std,alpha = 0.5,s =np.pi*20) # Get min & max mean and std of store sales # Remember that store_df[""Sales""] has the average sales for a store std_sales = rossmann_df.groupby('Store')[""Sales""].std() min_average = store_df[""Sales""].min() max_average = store_df[""Sales""].max() min_std = std_sales.min() max_std = std_sales.max() # Set the x and y limits of the plot plt.ylim([min_std, max_std]) plt.xlim([min_average, max_average]) # Set the plot axis titles plt.xlabel('Expected Sales') plt.ylabel('Risk') # Set label label, x, y = ""Store {}"".format(store_id), store_average, store_std plt.annotate( label, xy = (x, y), xytext = (50, 50), textcoords = 'offset points', ha = 'right', va = 'bottom', arrowprops = dict(arrowstyle = '-', connectionstyle = 'arc3,rad=-0.3'))'",No,4,33.0 "# .... 
continue Correlation # Plot correlation between range of stores start_store = 1 end_store = 5 fig, (axis1) = plt.subplots(1,1,figsize=(15,5)) # using summation of sales values for each store sns.heatmap(store_piv[list(range(start_store, end_store+1))].corr(),annot=True,linewidths=2) # using percent change for each store # sns.heatmap(store_pct_chage[list(range(start_store, end_store+1))].corr(),annot=True,linewidths=2)",No,5,80.0 "# Notice that test_df has only year=2015, and months 8 & 9 # drop Year and Month rossmann_df.drop([""Year"", ""Month""], axis=1, inplace=True) test_df.drop([""Year"", ""Month""], axis=1, inplace=True) # Create dummy varibales for DayOfWeek day_dummies_rossmann = pd.get_dummies(rossmann_df['DayOfWeek'], prefix='Day') day_dummies_rossmann.drop(['Day_7'], axis=1, inplace=True) day_dummies_test = pd.get_dummies(test_df['DayOfWeek'],prefix='Day') day_dummies_test.drop(['Day_7'], axis=1, inplace=True) rossmann_df = rossmann_df.join(day_dummies_rossmann) test_df = test_df.join(day_dummies_test) rossmann_df.drop(['DayOfWeek'], axis=1,inplace=True) test_df.drop(['DayOfWeek'], axis=1,inplace=True)'",No,3,8.0 "# remove all rows(store,date) that were closed rossmann_df = rossmann_df[rossmann_df[""Open""] != 0] # drop unnecessary columns, these columns won't be useful in prediction rossmann_df.drop([""Open"",""Customers"", ""Date""], axis=1, inplace=True)'",No,3,10.0 "# save ids of closed stores, because we will assign their sales value to 0 later(see below) closed_store_ids = test_df[""Id""][test_df[""Open""] == 0].values # remove all rows(store,date) that were closed test_df = test_df[test_df[""Open""] != 0] # drop unnecessary columns, these columns won't be useful in prediction test_df.drop(['Open', 'Date'], axis=1,inplace=True)'",Yes,2,14.0 "# Loop through each store, # train the model using the data of current store, and predict it's sales values. rossmann_dic = dict(list(rossmann_df.groupby('Store'))) test_dic = dict(list(test_df.groupby('Store'))) submission = Series() scores = [] for i in test_dic: # current store store = rossmann_dic[i] # define training and testing sets X_train = store.drop([""Sales"",""Store""],axis=1) Y_train = store[""Sales""] X_test = test_dic[i].copy() store_ids = X_test[""Id""] X_test.drop([""Id"",""Store""], axis=1,inplace=True) # Linear Regression lreg = LinearRegression() lreg.fit(X_train, Y_train) Y_pred = lreg.predict(X_test) scores.append(lreg.score(X_train, Y_train)) # Xgboost # params = {""objective"": ""reg:linear"", ""max_depth"": 10} # T_train_xgb = xgb.DMatrix(X_train, Y_train) # X_test_xgb = xgb.DMatrix(X_test) # gbm = xgb.train(params, T_train_xgb, 100) # Y_pred = gbm.predict(X_test_xgb) # append predicted values of current store to submission submission = submission.append(Series(Y_pred, index=store_ids)) # append rows(store,date) that were closed, and assign their sales value to 0 submission = submission.append(Series(0, index=closed_store_ids)) # save to csv file submission = pd.DataFrame({ ""Id"": submission.index, ""Sales"": submission.values}) submission.to_csv('rossmann.csv', index=False)'",Yes,2,7.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load in import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv) import matplotlib.pyplot as plt import seaborn as sns %matplotlib inline plt.style.use('ggplot') plt.rcParams['figure.figsize'] = (15, 6)",No,5,23.0 "df = pd.read_csv(""../input/train.csv"")",No,5,45.0 df.head().T,No,5,41.0 df.shape,No,5,58.0 "print (df.Store.unique()) print (df.Store.nunique())",No,3,54.0 df['Year'] = df.Date.map(lambda x: int(x[:4])),No,5,16.0 df['Month'] =df.Date.map(lambda x: int(x[5:7])),No,5,16.0 df.Promo.unique(),No,5,57.0 df.StateHoliday.unique(),No,5,57.0 df.groupby(['StateHoliday'])['Open'].agg('mean'),No,5,60.0 "df.StateHoliday = df['StateHoliday'].map(lambda x: (int)(x in ['a','b','c']))",No,5,20.0 df.groupby('StateHoliday')['Open'].mean(),No,5,60.0 df.SchoolHoliday.unique(),No,5,57.0 "plt.subplot('121') df['Sales'].hist(bins=100) plt.subplot('122') df['Customers'].hist(bins=100)",No,5,33.0 df.DayOfWeek.unique(),No,5,57.0 df.DayOfWeek[df.Open == 1].value_counts(),No,3,54.0 y = df.groupby('DayOfWeek')['Sales'].sum(),No,5,60.0 "sns.barplot(x=np.arange(7)+1, y=y)",No,5,33.0 "xx = df.groupby(['Promo', 'Month'])['Sales'].sum() xx",No,3,60.0 "plt.plot(xx[0], 'g', label='no promo') plt.plot(xx[1], 'b', label='promo')",No,5,33.0 "yms = df.groupby(['Year', 'Month'])['Sales'].sum()",No,5,60.0 "sns.barplot(x=np.arange(12)+1, y=yms[2013].sort_index())",No,5,33.0 "sns.barplot(x=np.arange(12)+1, y=yms[2014].sort_index())",No,5,33.0 "sns.barplot(x=np.arange(12)+1, y=yms[2014].sort_index() + yms[2013].sort_index())",No,5,33.0 "df_store = pd.read_csv(""../input/store.csv"")",No,5,45.0 df_store.head().T,No,5,41.0 df_store.StoreType.unique(),No,5,57.0 df_store.Assortment.unique(),No,5,57.0 "from sklearn.preprocessing import OneHotEncoder from sklearn.preprocessing import LabelEncoder",No,5,22.0 "df_store.fillna(value=0, inplace=True)",No,5,17.0 le = LabelEncoder(),No,2,4.0 df_store.StoreType = le.fit_transform(df_store.StoreType),No,5,20.0 df_store.Assortment = le.fit_transform(df_store.Assortment),No,5,20.0 df_store.PromoInterval.unique(),No,5,57.0 "dict1 = {0: 0, 'Jan,Apr,Jul,Oct': 1, 'Feb,May,Aug,Nov': 2, 'Mar,Jun,Sept,Dec': 3}",No,2,23.0 "df_store.PromoInterval = df_store.PromoInterval.apply(lambda x: dict1[x], convert_dtype=False)",No,4,8.0 "df_store.drop('Store', axis=1, inplace=True)",No,5,10.0 "df = df.join(df_store, on='Store')",No,5,32.0 df.PromoInterval = df.PromoInterval.map(lambda x: int(x)),No,5,16.0 print (df['StoreType'].value_counts() / len(df)),No,5,72.0 print (df['Assortment'].value_counts() / len(df)),No,5,72.0 "sales8 = df.groupby(['Store', 'Year', 'Month'])['Sales'].mean()[8]",No,5,60.0 "plt.plot(range(31), sales8.values) plt.plot([df.loc[7].CompetitionOpenSinceMonth+12, df.loc[7].CompetitionOpenSinceMonth+12], [sales8.min(), sales8.max()])",No,5,33.0 "df_t = df.drop(['Customers', 'Date', 'Year'], axis=1)",No,5,10.0 df_t.head(10).T,No,5,41.0 df_t = df_t[df_t.Open == 1],No,5,14.0 df_t.shape,No,5,58.0 "df_t.drop('Open', axis=1, inplace=True)",No,5,10.0 "from sklearn.model_selection import train_test_split from sklearn.model_selection import cross_val_score from sklearn.model_selection import GridSearchCV from sklearn.model_selection import validation_curve",No,5,22.0 "b""X_train, X_test, y_train, y_test = train_test_split(df_t.drop('Sales', axis=1).values, df_t.Sales.values,\\\n test_size = 0.2)""",No,5,13.0 from sklearn.ensemble import RandomForestRegressor,No,5,22.0 "params = range(50, 201, 50)",No,3,5.0 "b""scores, tst_scr = validation_curve(RandomForestRegressor(), X_train[:20000],\\\n y_train[:20000], 'n_estimators', params, \\\n cv=5, 
scoring='r2', verbose=2)""",No,4,6.0 "scores_mean = scores.mean(axis=1) scores_std = scores.std(axis=1) tst_scr_mean = tst_scr.mean(axis=1) tst_scr_std = tst_scr.std(axis=1) plt.plot(params, tst_scr_mean) plt.fill_between(params, tst_scr_mean + tst_scr_std, tst_scr_mean - tst_scr_std, alpha=0.3) plt.plot(params, scores_mean) plt.fill_between(params, scores_mean + scores_std, scores_mean - scores_std, alpha=0.3)",No,3,33.0 "params = range(3, 9)",No,5,5.0 "b""scores, tst_scr = validation_curve(RandomForestRegressor(n_estimators=100), X_train[:20000], \\\n y_train[:20000], 'max_features', params, \\\n cv=3, scoring='r2', verbose=2)""",No,5,6.0 "params = range(5, 51, 5)",No,5,5.0 "b""scores, tst_scr = validation_curve(RandomForestRegressor(n_estimators=100), X_train[:20000], \\\n y_train[:20000], 'max_depth', params, \\\n cv=3, scoring='r2', verbose=2)""",No,4,6.0 "model1 = RandomForestRegressor(n_estimators=100, max_depth=20, n_jobs=4, verbose=2)",No,5,4.0 "model1.fit(X_train, y_train)",No,5,7.0 idx = model1.feature_importances_.argsort()[::-1],No,4,79.0 "ax = sns.barplot(x=df_t.drop('Sales', axis=1).columns[idx], y=model1.feature_importances_[idx]) _ = plt.setp(ax.get_xticklabels(), rotation=-90)",No,5,79.0 y_pred = model1.predict(X_test),No,5,48.0 "from sklearn.metrics import mean_absolute_error from sklearn.metrics import r2_score",No,5,22.0 "mean_absolute_error(y_test, y_pred)",No,5,49.0 "r2_score(y_test, y_pred)",No,5,49.0 df_test = pd.read_csv('../input/test.csv'),No,5,45.0 df_test['Year'] = df_test.Date.map(lambda x: int(x[:4])),No,5,8.0 df_test['Month'] = df_test.Date.map(lambda x: int(x[5:7])),No,5,8.0 "df_test.StateHoliday = df_test['StateHoliday'].map(lambda x: (int)(x in ['a','b','c']))",No,4,8.0 "df_test = df_test.join(df_store, on='Store')",No,5,32.0 "df_test.drop(['Id', 'Date', 'Year'], axis=1, inplace=True)",No,5,10.0 "df_test.fillna(0, inplace=True)",No,5,17.0 df_test.head().T,No,5,41.0 df_test.Open.unique(),No,5,57.0 "ind_open = df_test.Open == 1 ind_closed = df_test.Open == 0 df_test2 = df_test[ind_open]",No,5,14.0 "df_test2.drop('Open', axis=1, inplace=True)",No,5,10.0 df_test2.head().T,No,5,41.0 X_out = df_test2.values,No,2,16.0 y_out = model1.predict(X_out),No,5,48.0 "df_out = pd.DataFrame(np.zeros(len(df_test)), columns=['Sales'])",No,5,12.0 "df_out[ind_open] = y_out.reshape(-1,1)",No,5,84.0 "df_out.set_index(np.arange(len(df_out))+1, inplace=True)",No,5,55.0 df_out.index.name = 'Id',No,5,55.0 df_out.head(),No,5,41.0 df_out.to_csv('out5.csv'),No,5,25.0 "import numpy as np import pandas as pd import matplotlib.pyplot as plt %matplotlib inline",No,5,23.0 "df_store = pd.read_csv('../input/store.csv', index_col='Store')",No,5,45.0 df_store.head(),No,5,41.0 df_store.info(),No,5,40.0 "categorial_features = ['StoreType', 'Assortment', 'PromoInterval']",No,1,23.0 "plt.figure(figsize=(20,16)) for i,country in enumerate(list_countries): Fatal_diff=Fatal_pivot[(Fatal_pivot[country]>0)][country].diff().fillna(0) Fatal_diff=Fatal_diff[Fatal_diff>0] plt.subplot(3,4,i+1) Fatal_diff.plot(color=colors[i],label=country.upper(),lw=5) plt.xticks(rotation=60) plt.title('Number of daily new Fatalities in {}'.format(country.upper())) plt.legend(title='Country') plt.tight_layout() ",No,5,33.0 "from sklearn.preprocessing import OneHotEncoder, LabelEncoder for p in categorial_features: X_int = LabelEncoder().fit_transform(df_store[p].values.astype(str)).reshape(-1,1) ohe_feat = OneHotEncoder(sparse=False).fit_transform(X_int) tmp = pd.DataFrame(ohe_feat, columns=['{0}='.format(p) + str(i) for i 
in df_store[p].unique()], index=df_store.index, dtype=int) df_store = pd.concat([df_store, tmp], axis=1) df_store = df_store.drop(p, axis=1)",No,4,8.0 "for p in ['CompetitionDistance', 'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear']: df_store.loc[:, p] = (df_store[p] - df_store[p].mean()) / df_store[p].std()",No,4,18.0 "# Understanding New cases confirmation variations on daily basis plt.figure(figsize=(20,16)) for i,country in enumerate(list_countries): plt.subplot(4,3,i+1) train_df[(train_df['Country_Region']==country)&(train_df['ConfirmedCases']!=0)].groupby('Date')['ConfirmedCases'].sum().diff().diff().plot(color=colors[i]) plt.ylabel('Difference in Daily reporting cases ') plt.title('Variation of {}'.format(country),va='bottom') plt.suptitle('Variation in number of confirmed cases on daily basis',fontsize=24,va='baseline')",No,5,33.0 "plt.figure(figsize=(16,8)) plt.title('Confirmed Cases trend from first day of incidence') for i,country in enumerate(list_countries): confirm_group=train_df[(train_df['Country_Region']==country)&train_df['ConfirmedCases']!=0].groupby('Date').agg({'ConfirmedCases':['sum']}) confirm_value=[j for j in confirm_group.ConfirmedCases['sum'].values] plot_value=confirm_value[0:60] plt.plot(plot_value,color=colors[i],label=country,lw=2) plt.legend(title='Countries')",No,5,33.0 "from sklearn.manifold import TSNE model = TSNE() arr = model.fit_transform(df_store.fillna(0)) plt.scatter(arr[:, 0], arr[:, 1])",No,3,33.0 "plt.figure(figsize=(16,10)) plt.title('Fatalities trend from first day of incidence') for i,country in enumerate(list_countries): fatal_group=train_df[(train_df['Country_Region']==country)&train_df['ConfirmedCases']!=0].groupby('Date').agg({'Fatalities':['sum']}) fatal_value=[j for j in fatal_group.Fatalities['sum'].values] plot_value=fatal_value[0:60] plt.plot(plot_value,color=colors[i],label=country,lw=2) plt.legend(title='Countries')",No,5,33.0 "from sklearn.cluster import AgglomerativeClustering from sklearn.metrics import silhouette_score scores = [] ns = list(range(2, 10)) + list(range(10, 30, 5)) for n in ns: agc = AgglomerativeClustering(n_clusters=n) store_cluster = agc.fit_predict(df_store.fillna(0)).reshape(-1,1) scores.append(silhouette_score(df_store.fillna(0), store_cluster.ravel())) plt.plot(ns, scores)",No,2,6.0 "agc = AgglomerativeClustering(n_clusters=9) store_cluster = agc.fit_predict(df_store.fillna(1)) store_cluster.shape",No,3,7.0 "from sklearn.manifold import TSNE model = TSNE() arr = model.fit_transform(df_store.fillna(1)) plt.scatter(arr[:, 0], arr[:, 1], c=store_cluster)",No,3,56.0 "print(df_train.shape) df_train.head()",No,3,45.0 df_train.StateHoliday.unique(),No,5,57.0 "df_train.replace({'StateHoliday': {0: '0'}}, inplace=True) df_train.StateHoliday.unique()",No,5,57.0 "SH_int = LabelEncoder().fit_transform(df_train.StateHoliday.values.astype(str)).reshape(-1,1) ohe_feat = OneHotEncoder(sparse=False).fit_transform(SH_int) tmp = pd.DataFrame(ohe_feat, columns=['SH='+ str(i) for i in df_train.StateHoliday.unique()], index=df_train.index, dtype=int) df_train = df_train.drop('StateHoliday', axis=1) df_train = pd.concat([df_train, tmp], axis=1)",No,2,8.0 "ohe_feat = OneHotEncoder(sparse=False).fit_transform(df_train.DayOfWeek.values.reshape(-1,1)) tmp = pd.DataFrame(ohe_feat, columns=['DayOfWeek=' + str(i) for i in df_train.DayOfWeek.unique()], index=df_train.index, dtype=int) df_train = df_train.drop('DayOfWeek', axis=1) df_train = pd.concat([df_train, tmp], axis=1)",No,3,8.0 "print(df_train.shape, 
df_train.columns)",No,3,8.0 "df_train['label'] = pd.Series([store_cluster[ind - 1] for ind in df_train.index], index=df_train.index) y_train = df_train[df_train.Open != 0].Sales.values X_train = df_train[df_train.Open != 0].drop(['Date', 'Sales', 'Customers'], axis=1).values",No,2,58.0 "from sklearn.linear_model import Ridge from sklearn.metrics import accuracy_score rlnrs = {} for i, c in enumerate(np.unique(store_cluster)): df_c = df_train[df_train.label == c] if df_c.shape[0] == 0: continue X_c = df_c.drop(['Date', 'Sales', 'Customers'], axis=1).values y_c = df_c.Sales.values rlnr = Ridge() rlnr.fit(X_c, y_c) rlnrs.update({c: rlnr}) print(c, rlnr.score(X_c, y_c))",No,3,21.0 "df_test = pd.read_csv('../input/test.csv', index_col='Id')",No,2,7.0 print(df_test.Open.unique()),No,5,57.0 "df_test.loc[df_test.Open == 0, 'Sales'] = 0 df_test.Open = df_test.loc[:, 'Open'].fillna(1)",No,5,17.0 "df_test.replace({'StateHoliday': {0: '0'}}, inplace=True) SH_int = LabelEncoder().fit_transform(df_test.StateHoliday.values.astype(str)).reshape(-1,1) ohe_feat = OneHotEncoder(sparse=False).fit_transform(SH_int) tmp = pd.DataFrame(ohe_feat, columns=['SH='+ str(i) for i in df_test.StateHoliday.unique()], index=df_test.index, dtype=int) df_test = df_test.drop('StateHoliday', axis=1) df_test = pd.concat([df_test, tmp], axis=1) df_test['SH=b'] = 0 df_test['SH=c'] = 0 ohe_feat = OneHotEncoder(sparse=False).fit_transform(df_test.DayOfWeek.values.astype(str).reshape(-1,1)) tmp = pd.DataFrame(ohe_feat, columns=['DayOfWeek=' + str(i) for i in df_test.DayOfWeek.unique()], index=df_test.index, dtype=int) df_test = df_test.drop('DayOfWeek', axis=1) df_test = pd.concat([df_test, tmp], axis=1) df_test['label'] = pd.Series([store_cluster[ind - 1] for ind in df_test.Store], index=df_test.index) df_test = df_test.fillna(0) X_test = df_test.drop(['Store', 'Date'], axis=1).values",No,3,57.0 "plt.figure(figsize=(16,8)) plt.subplot(1,2,1) train_df.groupby('Date')['ConfirmedCases'].sum().plot(color='blue') plt.ylabel('Number of Confirmed Cases') plt.title('Confirmed Cases worldwide trend') plt.subplot(1,2,2) train_df.groupby('Date')['Fatalities'].sum().plot(color='r') plt.ylabel('Number of Fatalities') plt.title(""Fatalities worldwide trend"") plt.tight_layout()'",No,5,33.0 "# Confirmed Cases and Fatalities without China's data plt.figure(figsize=(16,8)) plt.subplot(1,2,1) train_df[(train_df['Country_Region']!='China')&(train_df['ConfirmedCases']!=0)].groupby('Date')['ConfirmedCases'].sum().plot(color='blue') plt.ylabel('Number of Confirmed Cases') plt.title('Confirmed Cases worldwide trend(without China)') plt.subplot(1,2,2) train_df[(train_df['Country_Region']!='China')&(train_df['Fatalities']!=0)].groupby('Date')['Fatalities'].sum().plot(color='red') plt.ylabel('Number of Fatalities') plt.title(""Fatalities worldwide trend(without China)"") plt.tight_layout() '",No,5,33.0 countries=train_df['Country_Region'].unique(),No,5,57.0 "country_list=[] confirmation_list=[] list_fatality=[] for country in countries: country_list.append(country) confirm_country=train_df[train_df.Country_Region==country].groupby('Date')['ConfirmedCases'].sum().max() confirmation_list.append(confirm_country) fatal_country=train_df[train_df.Country_Region==country].groupby('Date')['Fatalities'].sum().max() list_fatality.append(fatal_country) max_dict={'Country':country_list,'ConfirmedCases':confirmation_list,'Fatalities':list_fatality} map_df=pd.DataFrame.from_dict(max_dict)",Yes,2,8.0 map_df,No,5,41.0 
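The per-country loop above can also be written as a single groupby; the sketch below is only an illustration and assumes the same train_df columns (Country_Region, Date, ConfirmedCases, Fatalities) used in the cells above.

import pandas as pd

def country_maxima(train_df: pd.DataFrame) -> pd.DataFrame:
    # sum provinces per country and date, then keep each country's peak cumulative value
    daily = (train_df
             .groupby(['Country_Region', 'Date'])[['ConfirmedCases', 'Fatalities']]
             .sum())
    return (daily
            .groupby(level='Country_Region').max()
            .reset_index()
            .rename(columns={'Country_Region': 'Country'}))

# map_df = country_maxima(train_df)  # should match the loop-built max_dict DataFrame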
code_df=pd.read_csv('../input/countrycodes/country-codes.csv'),No,5,45.0 "code_df=code_df[['ISO3166-1-Alpha-3','CLDR display name']]",No,5,10.0 "map_df=map_df.merge(code_df,left_on='Country',right_on='CLDR display name')",No,5,32.0 "map_df.drop('CLDR display name',axis=1,inplace=True)",No,5,10.0 "map_df.rename(columns={'ISO3166-1-Alpha-3':'Country Code'},inplace=True)",No,5,61.0 " from plotly.offline import download_plotlyjs,init_notebook_mode,plot,iplot init_notebook_mode(connected=True) data=go.Choropleth( locations=map_df['Country Code'], # Spatial coordinates z = map_df['ConfirmedCases'], # Data to be color-coded, colorscale = 'Reds', text=map_df['Country'], colorbar_title = ""Number of Confirmed Cases"",) fig=go.Figure(data) fig.update_layout( title='Covid-19 Confirmed Cases', geo=dict(showframe=False, projection={'type':'robinson'})) iplot(fig)'",No,5,33.0 test_df['Date']=pd.to_datetime(test_df['Date']),No,5,16.0 "test_df['Province_State']=test_df.drop('Province_State',axis=1)",No,5,10.0 train_df=train_df.reset_index(),No,2,10.0 "def error_metrics(model, predictions, y_test): print(""Model: "", model) # The mean squared error print(""--Mean squared error: %.2f"" % mean_squared_error(y_test, predictions)) # RMS print('--Root Mean Squared Error: %.2f' % np.sqrt(metrics.mean_squared_error(y_test, predictions))) # Explained variance score: 1 is perfect prediction print('--Variance score: %.2f' % r2_score(y_test, predictions))'",No,5,84.0 "# Take a look at some of the results def inspect_df(predictions, y_test): true_vs_pred = np.vstack((predictions, y_test)) true_df = pd.DataFrame(true_vs_pred) true_df = true_df.transpose() true_df.columns = [""Predicted"", ""Actual""] return true_df",No,5,12.0 "from IPython.display import display_html def display_side_by_side(*args): html_str='' for df in args: html_str+=df.to_html() display_html(html_str.replace('table','table style=""display:inline""'),raw=True)'",No,5,53.0 "ridge_pred, y_test = linear_models(x, y, ""ridge"") lasso_pred, y_test = linear_models(x, y, ""lasso"") xgb_pred, y_test = linear_models(x, y, ""xgb"") lgb_pred, y_test = linear_models(x, y, ""catboost"")",No,5,53.0 "error_metrics(""Ridge"", ridge_pred, y_test) error_metrics(""Lasso"", lasso_pred, y_test) error_metrics(""xgboost regression"", xgb_pred, y_test) error_metrics(""catboost regression"", lgb_pred, y_test)",No,5,49.0 "X_train, X_test, y_train, y_test = train_test_split(x, y, random_state = 42) model = CatBoostRegressor(random_state = 42) """"""grid = {'learning_rate': [0.03, 0.1], 'depth': [4, 6, 10], 'l2_leaf_reg': [1, 3, 5, 7, 9], #'num_leaves' : [5, 15, 31, 60], 'bagging_temperature' : [0, 1, 5, 10]} grid_search_model = model.grid_search(grid, X=X_train, y=y_train, cv = 5 )"""""" '",No,3,13.0 "model = CatBoostRegressor(num_leaves = 31, bagging_temperature= 0, depth = 6, l2_leaf_reg = 1, learning_rate = 0.03).fit(X_train, y_train) predictions = model.predict(X_test) error_metrics(""Catboost regression"", predictions, y_test)",No,3,7.0 print(model.get_feature_importance(prettified = True)),No,3,79.0 test.columns,No,5,71.0 "submit = pd.DataFrame() submit['ForecastId'] = test['ForecastId'] test.drop([""ForecastId""], axis = 1, inplace = True)'",No,4,12.0 submit,No,5,41.0 fatilities = model.predict(test),No,5,48.0 "model = CatBoostRegressor(num_leaves = 31, bagging_temperature= 0, depth = 4, l2_leaf_reg = 5, learning_rate = 0.1).fit(X_train, y_train) predictions = model.predict(X_test) error_metrics(""Catboost regression"", predictions, y_test)",No,4,79.0 confirmedCases = 
model.predict(test),Yes,5,48.0 "submit['ConfirmedCases'] = confirmedCases submit['Fatalities'] = fatilities",No,5,55.0 "submit.to_csv('submission.csv',index=False)",No,5,25.0 "import numpy as np import pandas as pd from sklearn.linear_model import LinearRegression",No,5,22.0 "test = pd.read_csv(""../input/test.csv"",parse_dates=[3],index_col=""Id"",dtype={""StateHoliday"":np.str}) train = pd.read_csv(""../input/train.csv"",parse_dates=[2],dtype={""StateHoliday"":np.str}) print(test.dtypes) # Kaggle doesn't seem to support Python 2.7. Oh well. print(train.dtypes)'",No,4,45.0 "set(test.Store.values).issubset(train.Store.values) # if true, I can predict sales per store, not the whole model with store info joined",No,5,53.0 "train = train.loc[train.Sales > 0] # Any day and store with 0 sales is ignored in scoring => no reason to predict them, either",No,5,14.0 "def prepare(df): # transform the date into something human-meaningful df['Year'] = pd.DatetimeIndex(df.Date).year df['Month'] = pd.DatetimeIndex(df.Date).month df['Day'] = pd.DatetimeIndex(df.Date).day # encode StateHolidays into numbers # Since there are only 'a' state holidays in test set, I can probably map a, b, c into 1 df[df.StateHoliday != '0'] = 1 df.StateHoliday = pd.to_numeric(df.StateHoliday) return df",No,5,21.0 "train = prepare(train); test = prepare(test); print(train.dtypes) print(test.dtypes)",No,4,70.0 "# Curses! NA! Foiled again! test.iloc[pd.isnull(test).any(1).nonzero()]",No,5,14.0 "test_nona = test.dropna().copy() stores = set(test_nona.Store.values) test_nona['Sales'] = 0 # create a column to be filled",No,5,17.0 "columns = ['DayOfWeek','Open','Promo','SchoolHoliday','Year','Month','Day','StateHoliday'] # Customers are not present in test, not worth using for store in stores: # takes *FOREVER* to run # pandas throws ""IndexingError: Unalignable boolean Series key provided"" if I index df directly # well, it must think it's so clever train_store_indices = (train.Store.values==store) y_train = train.Sales.values[train_store_indices] X_train = train[columns].values[train_store_indices] model = LinearRegression(normalize=True,n_jobs=-1).fit(X_train,y_train) test_store_indices = (test_nona.Store==store) X_test = test_nona[columns].values[test_store_indices] test_nona.Sales.values[test_store_indices] = model.predict(X_test)'",No,2,27.0 "test['Sales'] = test_nona.Sales test = test.fillna(0) # we didn't predict some stores with NAs, tell Kaggle to ignore them",No,5,17.0 "test[['Sales']].to_csv(""submission.csv"")'",No,5,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load in import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) # Input data files are available in the ""../input/"" directory. 
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory from subprocess import check_output print(check_output([""ls"", ""../input""]).decode(""utf8"")) # Any results you write to the current directory are saved as output.",No,5,88.0 "import sklearn.cross_validation as cv import sklearn.preprocessing as preprocessing import sklearn.feature_extraction as fe import sklearn.ensemble as es",No,5,22.0 "b""store = pd.read_csv('../input/store.csv')\ntrain = pd.read_csv('../input/train.csv',low_memory=False) # low_memory\ntest = pd.read_csv('../input/test.csv')""",No,5,45.0 "test.fillna(1, inplace=True) train = train[train[""Open""] != 0]",No,4,17.0 "train = pd.merge(train,store,on='Store') test = pd.merge(test,store,on='Store')",No,5,32.0 "sale_means = train.groupby('Store').mean().Sales sale_means.name = 'Sales=_Means' train = train.join(sale_means,on='Store') test = test.join(sale_means,on='Store')",No,2,32.0 "y = train.Sales.tolist() train_ = train.drop(['Date','Sales','Store','Customers'],axis=1).fillna(0) train_dic = train_.fillna(0).to_dict('records') test_dic = test.drop([""Date"",""Store"",""Id""],axis=1).fillna(0).to_dict('records')'",No,3,10.0 "dv = fe.DictVectorizer() X = dv.fit_transform(train_dic) Xo = dv.transform(test_dic)",No,5,8.0 "maxmin = preprocessing.MinMaxScaler() X = maxmin.fit_transform(X.toarray()) Xo = maxmin.transform(Xo.toarray())",No,5,18.0 "clf = es.RandomForestRegressor(n_estimators=25) clf.verbose = True clf.n_jobs = 8 clf",No,5,4.0 "clf.fit(Xtrain,Ytrain) print (""Training Score :"" + str(clf.score(Xtrain,Ytrain))) print (""Test Score : "" + str(clf.score(Xtest,Ytest)) )",No,3,7.0 "Yresult = clf.predict(Xtest) Yresult = np.array(Yresult) Ytest = np.array(Ytest)",No,5,48.0 "result = clf.predict(Xo) output = pd.DataFrame(test.Id).join(pd.DataFrame(result,columns=['Sales'])) output.to_csv('output.csv',index=False)",No,4,25.0 "import pandas as pd import matplotlib.pyplot as plt import seaborn as sns import numpy as np sns.set_style('whitegrid') %matplotlib inline # df_train = pd.read_csv(""../input/train.csv"") df_store = pd.read_csv(""../input/store.csv"") df_test = pd.read_csv(""../input/test.csv"") # df_train['Year'] = df_train['Date'].apply(lambda x: int(x[:4])) df_train['Month'] = df_train['Date'].apply(lambda x: int(x[5:7])) df_train.head()'",No,3,22.0 "cust_sales = pd.DataFrame() cust_sales['Customers'] = df_train['Customers'] cust_sales['Sales'] = df_train['Sales'] correlation_matrix = cust_sales.corr().abs() plt.subplots(figsize=(13, 9)) sns.heatmap(correlation_matrix,annot=True)",No,3,33.0 "df_train[""HolidayBin""] = df_train['StateHoliday'].map({""0"": 0, ""a"": 1, ""b"": 1, ""c"": 1}) sns.factorplot(x =""Year"", y =""Sales"", hue =""Promo"", data = df_train, size = 4, kind =""box"", palette =""muted"") sns.factorplot(x =""Year"", y =""Sales"", hue =""SchoolHoliday"", data = df_train, size = 4, kind =""box"", palette =""muted"") sns.factorplot(x =""Year"", y =""Sales"", hue =""HolidayBin"", data = df_train, size = 4, kind =""box"", palette =""muted"")'",No,5,33.0 "# df_train['StateHoliday'] = df_train['StateHoliday'].replace(0, '0') df_train[""HolidayBin""] = df_train['StateHoliday'].map({""0"": 0, ""a"": 1, ""b"": 1, ""c"": 1}) sns.factorplot(x =""Year"", y =""Sales"", hue =""StateHoliday"", data = df_train, size = 6, kind =""bar"", palette =""muted"")'",Yes,3,16.0 "import spacy nlp = spacy.blank(""en"") def get_token_num_by_offset(s, offset): s_pre = s[:offset] return len(spacy_tok.tokenizer(s_pre)) 
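# A brief illustration of the helper above: it maps one of the dataset's character
# offsets (e.g. Pronoun-offset) to a spaCy token index by tokenizing only the prefix.
# For instance, get_token_num_by_offset("Alice met Bob", 10) would typically return 2,
# since the prefix "Alice met " tokenizes into two tokens and "Bob" is then token 2.
# (spacy_tok is assumed to be the spaCy-backed tokenizer created elsewhere in the notebook.)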
# note that 'xxunk' is not special in this sense special_tokens = ['xxbos','xxfld','xxpad', 'xxmaj','xxup','xxrep','xxwrep'] def adjust_token_num(processed, token_num): """""" As fastai tokenizer introduces additional tokens, we need to adjust for them. """""" counter = -1 do_unrep = None for i, token in enumerate(processed): if token not in special_tokens: counter += 1 if do_unrep: do_unrep = False if processed[i+1] != ""."": token_num -= (int(token) - 2) # one to account for the num itself else: # spacy doesn't split full stops token_num += 1 if token == ""xxrep"": do_unrep = True if counter == token_num: return i else: counter = -1 for i, t in enumerate(processed): if t not in special_tokens: counter += 1 print(i, counter, t) raise Exception(f""{token_num} is out of bounds ({processed})"")'",Yes,3,8.0 "def dataframe_to_tensors(df, max_len=512): # offsets are: pron_tok_offset, a_tok_offset, a_tok_right_offset, b_tok_offset, b_tok_right_offset offsets = list() labels = np.zeros((len(df),), dtype=np.int64) processed = list() for i, row in tqdm(df.iterrows()): try: text = row[""Text""] a_offset = row[""A-offset""] a_len = len(nlp(row[""A""])) b_offset = row[""B-offset""] b_len = len(nlp(row[""B""])) pron_offset = row[""Pronoun-offset""] is_a = row[""A-coref""] is_b = row[""B-coref""] a_tok_offset = get_token_num_by_offset(text, a_offset) b_tok_offset = get_token_num_by_offset(text, b_offset) a_right_offset = a_tok_offset + a_len - 1 b_right_offset = b_tok_offset + b_len - 1 pron_tok_offset = get_token_num_by_offset(text, pron_offset) tokenized = tokenizer.process_text(text, spacy_tok)[:max_len] tokenized = [""xxpad""] * (max_len - len(tokenized)) + tokenized # add padding a_tok_offset = adjust_token_num(tokenized, a_tok_offset) a_tok_right_offset = adjust_token_num(tokenized, a_right_offset) b_tok_offset = adjust_token_num(tokenized, b_tok_offset) b_tok_right_offset = adjust_token_num(tokenized, b_right_offset) pron_tok_offset = adjust_token_num(tokenized, pron_tok_offset) numericalized = vocab.numericalize(tokenized) processed.append(torch.tensor(numericalized, dtype=torch.long)) offsets.append([pron_tok_offset, a_tok_offset, a_tok_right_offset, b_tok_offset, b_tok_right_offset]) if is_a: labels[i] = 0 elif is_b: labels[i] = 1 else: labels[i] = 2 except Exception as e: print(i) raise processed = torch.stack(processed) offsets = torch.tensor(offsets, dtype=torch.long) labels = torch.from_numpy(labels) return processed, offsets, labels",No,4,12.0 "train_ds = TensorDataset(*dataframe_to_tensors(test)) valid_ds = TensorDataset(*dataframe_to_tensors(val)) test_ds = TensorDataset(*dataframe_to_tensors(train))",No,4,13.0 "train_dl = DataLoader(train_ds, batch_size=64, shuffle=True) valid_dl = DataLoader(valid_ds, batch_size=32, shuffle=False) test_dl = DataLoader(test_ds, batch_size=32, shuffle=False)",No,3,45.0 lm.freeze(),No,2,23.0 "encoder_hidden_sz = 400 device = torch.device(""cuda"") class CorefResolver(nn.Module): def __init__(self, encoder, dropout_p=0.3): super(CorefResolver, self).__init__() self.encoder = encoder self.dropout = nn.Dropout(dropout_p) self.hidden2hidden = nn.Linear(encoder_hidden_sz * 2 + 1, 25) self.hidden2logits = nn.Linear(50, 3) self.relu = nn.ReLU() self.activation = nn.LogSoftmax(dim=1) self.loss = nn.NLLLoss() def forward(self, seqs, offsets, labels=None): encoded = self.dropout(self.encoder(seqs)[0][2]) a_q = list() b_q = list() for enc, offs in zip(encoded, offsets): # extract the hidden states that correspond to A, B and the pronoun, and make pairs of those 
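# offs follows the order built in dataframe_to_tensors:
# [pronoun, A_start, A_end, B_start, B_end], so enc[offs[0]] is the pronoun's hidden
# state and enc[offs[2]] / enc[offs[4]] are the last-token states of mentions A and B.
# Each pair below is [pronoun_state ; mention_state ; dot(pronoun_state, mention_state)],
# i.e. 2 * encoder_hidden_sz + 1 features, matching the hidden2hidden input size.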
a_repr = enc[offs[2]] b_repr = enc[offs[4]] a_q.append(torch.cat([enc[offs[0]], a_repr, torch.dot(enc[offs[0]], a_repr).unsqueeze(0)])) b_q.append(torch.cat([enc[offs[0]], b_repr, torch.dot(enc[offs[0]], b_repr).unsqueeze(0)])) a_q = torch.stack(a_q) b_q = torch.stack(b_q) # apply the same ""detector"" layer to both batches of pairs is_a = self.relu(self.dropout(self.hidden2hidden(a_q))) is_b = self.relu(self.dropout(self.hidden2hidden(b_q))) # concatenate outputs of the ""detector"" layer to get the final probability distribution is_a_b = torch.cat([is_a, is_b], dim=1) is_logits = self.hidden2logits(self.dropout(self.relu(is_a_b))) activation = self.activation(is_logits) if labels is not None: return activation, self.loss(activation, labels) else: return activation",Yes,3,23.0 enc = lm.model[0],No,3,4.0 resolver = CorefResolver(enc),No,2,53.0 resolver.to(device),No,3,23.0 "for param in resolver.encoder.parameters(): param.requires_grad = False",No,4,23.0 "lr = 0.001 loss_fn = nn.NLLLoss() optimizer = torch.optim.Adam(resolver.parameters(), lr=lr)",No,4,4.0 from sklearn.metrics import classification_report,No,5,22.0 "def train_epoch(model, optimizer, train_dl, report_every=10): model.train() step = 0 total_loss = 0 for texts, offsets, labels in train_dl: texts, offsets, labels = texts.to(device), offsets.to(device), labels.to(device) step += 1 optimizer.zero_grad() _, loss = model(texts, offsets, labels) total_loss += loss.item() loss.backward() optimizer.step() if step % report_every == 0: print(f""Step {step}, loss: {total_loss/report_every}"") total_loss = 0 def evaluate(model, optimizer, valid_dl, probas=False): probas = list() model.eval() predictions = list() total_loss = 0 all_labels = list() with torch.no_grad(): for texts, offsets, labels in valid_dl: texts, offsets, labels = texts.cuda(), offsets.cuda(), labels.cuda() preds, loss = model(texts, offsets, labels) total_loss += loss.item() probas.append(preds.cpu().detach().numpy()) predictions.extend([i.item() for i in preds.max(1)[1]]) print(f""Validation loss: {total_loss/len(valid_dl)}"") print() print(classification_report(valid_dl.dataset.tensors[2].numpy(), predictions)) if probas: return total_loss, np.vstack(probas) return total_loss, predictions",Yes,4,2.0 "total_epoch = 0 best_loss = 1e6 for i in range(3): print(""Epoch"", i + 1) total_epoch += 1 train_epoch(resolver, optimizer, train_dl) loss, labels = evaluate(resolver, optimizer, valid_dl) if loss < best_loss: best_loss = loss print(f""Loss improved, saving {total_epoch}"") torch.save(resolver.state_dict(), data_path/""model_best.pt"")",No,5,7.0 "for param in resolver.encoder.parameters(): param.requires_grad = True",No,4,59.0 "lr = 3e-4 optimizer = torch.optim.Adam(resolver.parameters(), lr=lr)",No,5,4.0 x = np.load('../input/prepare-for-submission/0.npy'),No,5,44.0 "from keras.models import load_model import tensorflow as tf def logloss(y, y_): return tf.losses.log_loss(y,y_) model = load_model(mf, custom_objects={'logloss':logloss})",No,4,4.0 import gc,No,5,22.0 y1 = model.predict(x),No,5,27.0 "for i in range(6): print(""Epoch"", i + 1) total_epoch += 1 train_epoch(resolver, optimizer, train_dl) loss, labels = evaluate(resolver, optimizer, valid_dl) if loss < best_loss: best_loss = loss print(f""Loss improved, saving {total_epoch}"") torch.save(resolver.state_dict(), data_path/""model_best.pt"")",No,4,2.0 "resolver.load_state_dict(torch.load(data_path/""model_best.pt""))",No,5,44.0 "loss, res = evaluate(resolver, optimizer, test_dl, True) res_s = np.exp(res) # don't 
forget that we have log-softmax outputs: submission = pd.DataFrame(res_s, index=train[""ID""], columns=[""A"", ""B"", ""NEITHER""]) submission.to_csv(""submission.csv"", index=""id"")'",Yes,3,27.0 "import pandas as pd import numpy as np import seaborn as sns import matplotlib.pyplot as plt from sklearn.linear_model import LinearRegression from sklearn.model_selection import train_test_split from sklearn.model_selection import cross_val_score from sklearn.model_selection import GridSearchCV from sklearn.model_selection import validation_curve from sklearn.ensemble import RandomForestRegressor from sklearn.model_selection import cross_val_score from sklearn.metrics import r2_score %matplotlib inline",No,5,23.0 "del x gc.collect()",No,1,53.0 "x = np.load('../input/prepare-for-submission-2/1.npy') y2 = model.predict(x) del x gc.collect()",No,2,27.0 "x = np.load('../input/prepare-for-submission-3/2.npy') y3 = model.predict(x) del x gc.collect()",No,3,27.0 "y = np.concatenate([y1, y2, y3])",No,5,11.0 "b""df = pd.DataFrame()\ndf['ID'] = pd.read_csv('../input/gendered-pronoun-resolution/test_stage_2.tsv', delimiter='\\t')['ID']""",No,5,45.0 "df_train = pd.read_csv('../input/train.csv') df_test = pd.read_csv('../input/test.csv') df_store = pd.read_csv('../input/store.csv')",No,5,45.0 df_s = pd.read_csv('../input/gendered-pronoun-resolution/sample_submission_stage_2.csv'),No,5,45.0 "df['A'] = y[:,0] #, 'B', 'NEITHER'",No,5,8.0 "df['B'] = y[:,1] df['NEITHER'] = y[:,2]",No,5,8.0 "y = df_train[""Sales""].values",No,5,21.0 "df.to_csv('submission.csv', index=False)",No,5,25.0 "df_train['Year'] = df_train['Date'].apply(lambda x: int(x[0:4])) df_train['Month'] = df_train['Date'].apply(lambda x: int(x[5:7])) df_train['Day'] = df_train['Date'].apply(lambda x: int(x[8:10])) df_test['Year'] = df_test['Date'].apply(lambda x: int(x[0:4])) df_test['Month'] = df_test['Date'].apply(lambda x: int(x[5:7])) df_test['Day'] = df_test['Date'].apply(lambda x: int(x[8:10]))",No,5,8.0 "b""import pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\n%matplotlib inline\n\nplt.style.use('ggplot')\nplt.rcParams['figure.figsize'] = (12,8)\n\n# \nfont = {'family': 'Verdana',\n 'weight': 'normal'}\nplt.rc('font', **font)""",No,5,23.0 "data_train = pd.read_csv(""../input/train.csv"") data_test = pd.read_csv(""../input/test.csv"") data_store = pd.read_csv(""../input/store.csv"")",No,5,45.0 data_train.head(),No,5,41.0 "df_store.CompetitionDistance.fillna(value=0, inplace=True) df_test.Open.fillna(value=0, inplace=True) df_train.StateHoliday[df_train[""StateHoliday""] == 0] = ""0""",Yes,5,17.0 "print(df_train.shape) print(df_test.shape) print(df_store.shape)",No,5,58.0 "fig, (axis1,axis2) = plt.subplots(1,2,figsize=(12,4)) sns.barplot(x='Year', y='Sales', data=df_train, ax=axis1) sns.barplot(x='Year', y='Customers', data=df_train, ax=axis2)",No,5,75.0 data_train.shape,No,5,58.0 "df_train.query('Open == 1')[['Sales', 'Customers']].hist(bins=100, figsize=(13,7));",No,5,33.0 data_test.columns,No,5,71.0 "fig, (axis1,axis2) = plt.subplots(1,2,figsize=(12,4)) sns.barplot(x='Month', y='Sales', data=df_train, ax=axis1) sns.barplot(x='Month', y='Customers', data=df_train, ax=axis2)",No,5,75.0 data_store.head(n=3),No,5,41.0 "fig, (axis1,axis2) = plt.subplots(1,2,figsize=(12,4)) sns.barplot(x='DayOfWeek', y='Sales', data=df_train, ax=axis1) sns.barplot(x='DayOfWeek', y='Customers', data=df_train, ax=axis2)",No,5,75.0 "data_train.StateHoliday = data_train.StateHoliday.replace(0,'0') data_test.StateHoliday = 
data_test.StateHoliday.replace(0,'0') data_train.DayOfWeek.value_counts()",No,4,8.0 "df_train[['Sales', 'Customers']].corr()",No,3,41.0 "df_DayOfWeek = pd.get_dummies(df_train.DayOfWeek, prefix='DayOfWeek') df_StateHoliday = pd.get_dummies(df_train.StateHoliday, prefix=""StateHoliday_"") df_train = pd.concat([df_train, df_DayOfWeek, df_StateHoliday], axis=1) del df_train[""Date""] del df_train[""Day""] del df_train[""Customers""] del df_train[""DayOfWeek""] del df_train[""Sales""] del df_train[""StateHoliday""]'",Yes,4,12.0 "df_StoreType = pd.get_dummies(df_store.StoreType, prefix='StoreType_') df_Assortment = pd.get_dummies(df_store.Assortment, prefix='Assortment_') df_store = pd.concat([df_store, df_StoreType, df_Assortment], axis=1) del df_store[""StoreType""] del df_store[""Assortment""] del df_store[""PromoInterval""]'",Yes,4,12.0 "df = pd.merge(df_train, df_store, how='left', on=['Store'])",No,5,32.0 "df.fillna(0, inplace=True)",No,5,17.0 "X = df.values[:,1:]",No,5,21.0 "parametrs = range(40, 241, 40)",No,5,5.0 "b""scores, tst_scr = validation_curve(RandomForestRegressor(n_jobs = 4), X[:20000],\\\n y[:20000], 'n_estimators', parametrs, cv=5, scoring='r2', verbose=2)""",No,5,2.0 "scores_mean = scores.mean(axis=1) scores_std = scores.std(axis=1) tst_scr_mean = tst_scr.mean(axis=1) tst_scr_std = tst_scr.std(axis=1) plt.plot(parametrs, tst_scr_mean) plt.fill_between(parametrs, tst_scr_mean + tst_scr_std, tst_scr_mean - tst_scr_std, alpha=0.3) plt.plot(parametrs, scores_mean) plt.fill_between(parametrs, scores_mean + scores_std, scores_mean - scores_std, alpha=0.3)",No,5,35.0 "parametrs = range(3, 24)",No,5,5.0 "b""scores, tst_scr = validation_curve(RandomForestRegressor(n_estimators=120, n_jobs = 4), X[:20000], \\\n y[:20000], 'max_features', parametrs, cv=3, scoring='r2', verbose=2)""",No,5,1.0 "data_train['Year'] = data_train['Date'].apply(lambda x: int(x[:4])) data_train['Month'] = data_train['Date'].apply(lambda x: int(x[5:7])) data_train.head()",No,4,8.0 "parametrs = range(4, 61, 4)",No,5,5.0 "average_sales_per_month = data_train.groupby('Month')[""Sales""].mean() plt.figure(figsize=(8, 5)) average_sales_per_month.plot(legend=True, marker='o', title=""Average sales per month"")'",No,4,33.0 "b""scores, tst_scr = validation_curve(RandomForestRegressor(n_estimators=120, n_jobs = 4, max_features=16), X[:20000], \\\n y[:20000], 'max_depth', parametrs, cv=3, scoring='r2', verbose=2)""",No,5,84.0 "average_sales_per_day = data_train.groupby('Date')[""Sales""].mean() fig = plt.subplots(1,1, sharex=True, figsize=(18, 5)) average_sales_per_day.plot(legend=True, title=""Average Daily Sales"")'",No,3,33.0 "model = RandomForestRegressor(n_estimators=120, max_depth=20, max_features=16, n_jobs=4, verbose=2)",No,5,4.0 "model.fit(X, y)",No,5,7.0 idx = model.feature_importances_.argsort()[::-1],No,5,79.0 "ax = sns.barplot(x=model.feature_importances_[idx], y=df.drop('Store', axis=1).columns[idx])",No,5,79.0 "df_DayOfWeek = pd.get_dummies(df_test.DayOfWeek, prefix='DayOfWeek') df_StateHoliday = pd.get_dummies(df_test.StateHoliday, prefix=""StateHoliday_"") df_StateHoliday = pd.concat([df_StateHoliday, pd.DataFrame(columns=['StateHoliday__b', 'StateHoliday__c'])], axis = 1)'",Yes,3,20.0 "columns_corr = ['Sales', 'Customers', 'Promo', 'StateHoliday', 'SchoolHoliday'] data_train[columns_corr].corr(method='pearson')",No,5,40.0 "df_StateHoliday.fillna(0, inplace=True)",No,5,17.0 "data_train['StateHoliday'] = data_train['StateHoliday'].replace(0, '0') data_train[""HolidayBin""] = 
data_train['StateHoliday'].map({""0"": 0, ""a"": 1, ""b"": 1, ""c"": 1}) data_train.StateHoliday.unique()'",No,4,8.0 "del df_test[""Date""] del df_test[""Day""] del df_test[""DayOfWeek""] del df_test[""StateHoliday""] df_test = pd.concat([df_test, df_DayOfWeek, df_StateHoliday], axis=1) test_df = pd.merge(df_test, df_store, how='left', on=['Store']) test_df.fillna(0, inplace=True)'",Yes,4,12.0 "y_test_pred = model.predict(test_df.values[:,2:])",No,5,48.0 "average_customers_per_month = data_train.groupby('Month')['Customers'].mean() average_sales_per_month = data_train.groupby('Month')['Sales'].mean()",No,4,60.0 "submission = pd.DataFrame({ ""Id"": test_df.Id, ""Sales"": y_test_pred.reshape(-1.1)})",No,5,12.0 "plt.figure(figsize=(6, 4)) plt.plot(average_sales_per_month)",No,5,33.0 "plt.figure(figsize=(6, 4)) plt.plot(average_customers_per_month)",No,5,33.0 "submission.to_csv(""rossman.csv"",index=False)",No,5,25.0 "import pandas as pd import sklearn import numpy as np import matplotlib.pyplot as plt from sklearn.ensemble import RandomForestRegressor import seaborn as sns %matplotlib inline ",No,5,23.0 "total_customers_for_store = data_train.groupby('Store')['Sales', 'Customers'].sum()",No,5,60.0 "data_total_customers_for_store = pd.DataFrame({'Sales': total_customers_for_store['Sales'], 'Customers': total_customers_for_store['Customers']}, index = total_customers_for_store.index)",No,5,12.0 "storedf = pd.read_csv(""../input/store.csv"") storedf = storedf [[""Store"", ""Assortment"", ""CompetitionDistance"",""Promo2""]] storedf = storedf.set_index(""Store"") storedf.CompetitionDistance = storedf.CompetitionDistance.fillna(storedf.CompetitionDistance.max()) storedf.head()",Yes,3,45.0 data_total_customers_for_store = data_total_customers_for_store.reset_index(),No,5,61.0 "average_sales_customers = data_train.groupby('Store')['Sales', 'Customers'].mean()",No,5,60.0 "def f(traindf): #traindf = traindf[traindf.Open==1] traindf = traindf.join(storedf, on=""Store"") traindf['isWeekEnd'] = traindf.DayOfWeek>=5 traindf['Month'] = list (map (lambda x: int(x[5:7]), traindf.Date)) traindf['Day'] = list (map (lambda x: int(x[8:]), traindf.Date)) traindf['isWinter'] = np.logical_or (traindf.Month <= 2, traindf.Month == 12) traindf['isSpring'] = np.logical_and (traindf.Month >= 3, traindf.Month <= 5) traindf['isSummer'] = np.logical_and (traindf.Month >= 6, traindf.Month <= 8) traindf['isAutumn'] = np.logical_and (traindf.Month >= 9, traindf.Month <= 11) traindf['AssortmentA'] = traindf.Assortment=='a' traindf['AssortmentB'] = traindf.Assortment=='b' traindf['AssortmentC'] = traindf.Assortment=='c' traindf['isEndofMonth'] = traindf.Day >= 25 traindf['isBeginofMonth'] = traindf.Day <= 10 traindf['CompetitionDistance'] = traindf.CompetitionDistance del traindf [""Assortment""] del traindf [""StateHoliday""] del traindf [""SchoolHoliday""] del traindf [""Date""] del traindf [""Store""] del traindf [""DayOfWeek""] del traindf [""Month""] del traindf [""Day""] return traindf'",Yes,3,14.0 "data_average_sales_customers = pd.DataFrame({'Sales': average_sales_customers['Sales'], 'Customers': average_sales_customers['Customers']}, index = average_sales_customers.index) data_average_sales_customers = data_average_sales_customers.reset_index() data_stores_average = data_average_sales_customers.join(data_store.set_index('Store'), on='Store') data_stores_average.head()",Yes,4,12.0 "data_average_sales_customers = pd.DataFrame({'Sales': average_sales_customers['Sales'], 'Customers': average_sales_customers['Customers']}, 
index = average_sales_customers.index)",No,5,12.0 "traindf = pd.read_csv(""../input/train.csv"", low_memory=False) traindf = f(traindf) traindf = traindf[traindf.Open == 1] ytrain = traindf.Sales.values del traindf [""Sales""] del traindf [""Customers""] del traindf [""Open""] traindf.head()",Yes,4,45.0 data_average_sales_customers = data_average_sales_customers.reset_index(),No,5,84.0 "testdf = pd.read_csv(""../input/test.csv"") testdf = f(testdf) testdf.head()",Yes,4,45.0 "data_stores_average = data_average_sales_customers.join(data_store.set_index('Store'), on='Store')",No,5,32.0 "model = RandomForestRegressor(min_samples_leaf=2, max_depth=30, n_estimators=30) %time model.fit(traindf.values, ytrain)",No,5,7.0 data_stores_average.head(n=3),No,5,41.0 "data_stores_new = data_total_customers_for_store.join(data_store.set_index('Store'), on='Store')",No,5,32.0 "average_store_type = data_stores_new.groupby('StoreType')['Sales', 'Customers', 'CompetitionDistance'].mean()",No,5,60.0 "y = model.predict(testdf.values[:, 2:]) df = pd.DataFrame([]) df['Sales'] = y df['Sales'][testdf.Open == 0] = 0 df = df.set_index(testdf.Id) pd.DataFrame.to_csv(df, 'ans.csv') df.head()",Yes,4,27.0 "Data_cmp = pd.DataFrame() Data_cmp['Customers'] = average_store_type['Sales'] Data_cmp['Sales'] = average_store_type['Customers'] Data_cmp['Comp'] = average_store_type['CompetitionDistance']",No,4,12.0 "columns_corr = ['Sales', 'Customers', 'Comp'] Data_cmp[columns_corr].corr(method='pearson')",No,5,40.0 "average_assort = data_stores_new.groupby('Assortment')['Sales', 'Customers'].mean()",No,5,60.0 "closed_store_data = data_test[""Id""][data_test[""Open""] == 0].values data_train.StateHoliday = data_train.StateHoliday.replace(0,'0') data_test.StateHoliday = data_test.StateHoliday.replace(0,'0')'",No,5,8.0 "data_train['Year'] = data_train['Date'].apply(lambda x: int(x[:4])) data_train['Month'] = data_train['Date'].apply(lambda x: int(x[5:7])) data_train[""HolidayBin""] = data_train.StateHoliday.map({""0"": 0, ""a"": 1, ""b"": 1, ""c"": 1})'",No,4,8.0 "del data_train['Date'] del data_train['StateHoliday']",No,5,10.0 "data_test['Year'] = data_test['Date'].apply(lambda x: int(x[:4])) data_test['Month'] = data_test['Date'].apply(lambda x: int(x[5:7])) data_test[""HolidayBin""] = data_test.StateHoliday.map({""0"": 0, ""a"": 1, ""b"": 1, ""c"": 1})'",No,5,8.0 "del data_test['Date'] del data_test['StateHoliday']",No,5,10.0 "data_test = data_test[data_test[""Open""] != 0]",No,5,14.0 data_test[data_test['Store'] == 1].head(),No,5,41.0 "arr_tmp = [] for i in data_test['Store']: arr_tmp.append(float(data_store['CompetitionDistance'][data_store['Store'] == i])) data_test['CompetitionDistance'] = arr_tmp",No,3,8.0 "arr_tmp = [] for i in data_train['Store']: arr_tmp.append(float(data_store['CompetitionDistance'][data_store['Store'] == i])) data_train['CompetitionDistance'] = arr_tmp data_train['CompetitionDistance'] = data_train['CompetitionDistance'].fillna(data_train['CompetitionDistance'].mean())",No,3,8.0 "train_stores = dict(list(data_train.groupby('Store'))) test_stores = dict(list(data_test.groupby('Store')))",No,4,60.0 "from sklearn.linear_model import LinearRegression from sklearn.ensemble import RandomForestRegressor from sklearn.model_selection import GridSearchCV",No,5,22.0 "# result = pd.Series() for i in test_stores: store = train_stores[i] X_train = store.drop([""Sales"", ""Store"", ""Customers""],axis=1) Y_train = store[""Sales""] X_test = test_stores[i].copy() store_ind = X_test[""Id""] X_test.drop([""Id"",""Store""], 
axis=1,inplace=True) X_train = X_train.fillna(X_train.mean()) X_test = X_test.fillna(X_train.mean()) estimator = RandomForestRegressor(n_estimators=10, max_depth=13, criterion = 'mse') estimator.fit(X_train, Y_train) Y_pred = estimator.predict(X_test) result = result.append(pd.Series(Y_pred, index=store_ind)) result = result.append(pd.Series(0, index=closed_store_data)) result = pd.DataFrame({ ""Id"": result.index, ""Sales"": result.values}) result.to_csv('result_new.csv', index=False)'",Yes,4,25.0 "print(df_test.shape, df_test.columns)",Yes,3,17.0 "for c in rlnrs.keys(): X_c = df_test[(df_test.label == c) & (df_test.Open != 0)].drop(['Store', 'Date', 'Sales'], axis=1).values df_test.loc[(df_test.label == c) & (df_test.Open != 0), 'Sales'] = rlnrs[c].predict(X_c) df_test.Sales.mean(), df_test.Sales.min(), df_test.Sales.max()",Yes,2,16.0 "out = pd.DataFrame({ ""Id"": df_test.index, ""Sales"": df_test.Sales.values }) out.to_csv('submission.csv', index=False)'",No,2,58.0 "import numpy as np import pandas as pd from sklearn.model_selection import GridSearchCV from lightgbm import LGBMRegressor from sklearn.metrics import make_scorer from sklearn.preprocessing import LabelEncoder, Imputer, OneHotEncoder, FunctionTransformer from sklearn.preprocessing import StandardScaler from sklearn.base import TransformerMixin from sklearn.pipeline import make_union, make_pipeline %matplotlib inline",No,5,23.0 "df_train = pd.read_csv(""../input/train.csv"", parse_dates=[""Date""], date_parser=pd.to_datetime, low_memory=False) df_test = pd.read_csv(""../input/test.csv"", parse_dates=[""Date""], date_parser=pd.to_datetime, low_memory=False) df_store = pd.read_csv(""../input/store.csv"")",No,5,45.0 "train = df_train.merge(df_store) test = df_test.merge(df_store)",No,5,32.0 "print(""train"") print(""max: "", df_train.Date.min()) print(""min:"", df_train.Date.max()) print(""delta: "", df_train.Date.max() - df_train.Date.min())",No,5,40.0 "print(""test"") print(""max: "", df_test.Date.min()) print(""min:"", df_test.Date.max()) print(""delta: "", df_test.Date.max() - df_test.Date.min())",No,5,40.0 "df_train.groupby(""DayOfWeek"").agg({""Sales"": ""mean""}).plot(kind=""bar"")",No,5,33.0 df_train[(df_train.DayOfWeek == 7) & (df_train.Open == 1)].Store.unique().shape[0],No,5,54.0 "df_train.groupby(""StateHoliday"").agg({""Sales"": ""mean""}).plot(kind=""bar"")",No,5,33.0 "df_train.groupby(""SchoolHoliday"").agg({""Sales"": ""mean""}).plot(kind=""bar"")",No,5,33.0 "df_train.groupby(""Promo"").agg({""Sales"": ""mean""}).plot(kind=""bar"")",No,5,33.0 train.PromoInterval.unique(),No,5,57.0 "def get_Promo2Active(df): months_map = {v:i+1 for i, v in enumerate([""Jan"", ""Feb"", ""Mar"", ""Apr"", ""May"", ""Jun"", ""Jul"", ""Aug"", ""Sept"", ""Oct"", ""Nov"", ""Dec""])} def is_Promo2_active(row): if row.Promo2 == 0: return 0 current_week, current_month, current_year = row.Date.week, row.Date.month, row.Date.year start_week, start_year = row.Promo2SinceWeek, row.Promo2SinceYear active_months = set([months_map[m] for m in row.PromoInterval.split("","")]) has_started = (current_year == start_year and current_week >= start_week) or current_year > start_year return int(has_started and current_month in active_months) return df.apply(is_Promo2_active, axis=1)",No,5,8.0 "def get_CompetitionActive(df): def is_competition_active(row): if np.isnan(row.CompetitionDistance): return 0 if np.isnan(row.CompetitionOpenSinceMonth) and np.isnan(row.CompetitionOpenSinceYear): return 1 current_month, current_year = row.Date.month, row.Date.year 
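# A competitor with a known distance but no recorded opening date was already treated
# as active above; otherwise it counts as active once its opening year/month is not
# later than the row's date, which is what the comparison below encodes.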
opened_month, opened_year = row.CompetitionOpenSinceMonth, row.CompetitionOpenSinceYear return int((current_year == opened_year and current_month >= opened_month) or current_year > opened_year) return df.apply(is_competition_active, axis=1)",No,5,8.0 "train[""Promo2Active""] = get_Promo2Active(train) train[""CompetitionActive""] = get_CompetitionActive(train)",No,5,8.0 "train[train.Promo2 == 1].groupby(""Promo2Active"").agg({""Sales"": ""mean""}).plot(kind=""bar"")",No,5,60.0 "train.groupby(""CompetitionActive"").agg({""Sales"": ""mean""}).plot(kind=""bar"")",No,5,33.0 "# we already have these, but for the sake of consistence, let's do it again train = df_train.merge(df_store) test = df_test.merge(df_store) train[""Promo2Active""] = get_Promo2Active(train) train[""CompetitionActive""] = get_CompetitionActive(train) test[""Promo2Active""] = get_Promo2Active(test) test[""CompetitionActive""] = get_CompetitionActive(test)'",No,4,8.0 "train[""DayOfYear""] = train.Date.apply(lambda x: x.timetuple().tm_yday) test[""DayOfYear""] = test.Date.apply(lambda x: x.timetuple().tm_yday)",No,5,16.0 "min_date = train.Date.min() # we know that all test data happened later def date_to_day_number(df): return (df.Date - min_date).apply(lambda x: x.days)",No,5,8.0 "train[""Day""] = date_to_day_number(train) test[""Day""] = date_to_day_number(test)",No,5,8.0 "train.sort_values(""Day"", inplace=True) test.sort_values(""Day"", inplace=True)",No,5,9.0 "def rmspe(y_true, y_pred): w = np.zeros(y_true.shape, dtype=float) ind = y_true != 0 w[ind] = 1./ (y_true[ind]**2) return np.sqrt(np.mean(w * (y_true - y_pred)**2)) rmspe_scorer = make_scorer(rmspe, greater_is_better=False)",No,5,84.0 "train_baseline = train.copy() train_baseline['Last_Week_Sales'] = train_baseline.groupby(""Store"")[""Sales""].shift() train_baseline['Last_Week_Diff'] = train_baseline.groupby(""Store"")[""Last_Week_Sales""].diff() train_baseline.dropna(inplace=True, subset=[""Last_Week_Sales"", ""Last_Week_Diff""]) train_baseline.head()'",No,2,60.0 "mean_error = [] for day in range(2, train_baseline.Day.max() + 1): val = train_baseline[train_baseline.Day == day] p = val.Last_Week_Sales.values error = rmspe(val.Sales.values, p) mean_error.append(error) print('Mean Error = %.5f' % np.mean(mean_error))",No,2,60.0 "class LabelEncoderPipelineFriendly(LabelEncoder): def fit(self, X, y=None): """"""this would allow us to fit the model based on the X input."""""" super(LabelEncoderPipelineFriendly, self).fit(X) def transform(self, X, y=None): return super(LabelEncoderPipelineFriendly, self).transform(X).reshape(-1, 1) def fit_transform(self, X, y=None): return super(LabelEncoderPipelineFriendly, self).fit(X).transform(X).reshape(-1, 1)",No,5,20.0 "def prepare_pipeline(df): def get_DayOfWeek(df): return df[""DayOfWeek""] def get_Open(df): return df[[""Open""]] def get_Promo(df): return df[[""Promo""]] def get_StateHoliday(df): return df[""StateHoliday""] def get_SchoolHoliday(df): return df[[""SchoolHoliday""]] def get_StoreType(df): return df[""StoreType""] def get_Assortment(df): return df[""Assortment""] def get_Promo2Active(df): return df[[""Promo2Active""]] def get_CompetitionActive(df): return df[[""CompetitionActive""]] def get_CompetitionDistance(df): return df[[""CompetitionDistance""]] def get_DayOfYear(df): return df[""DayOfYear""] p = make_union(*[ make_pipeline(FunctionTransformer(get_DayOfWeek, validate=False), LabelEncoderPipelineFriendly(), OneHotEncoder()), make_pipeline(FunctionTransformer(get_Open, validate=False), 
Imputer(strategy=""most_frequent"")), make_pipeline(FunctionTransformer(get_Promo, validate=False)), make_pipeline(FunctionTransformer(get_StateHoliday, validate=False), LabelEncoderPipelineFriendly(), OneHotEncoder()), make_pipeline(FunctionTransformer(get_SchoolHoliday, validate=False)), make_pipeline(FunctionTransformer(get_StoreType, validate=False), LabelEncoderPipelineFriendly(), OneHotEncoder()), make_pipeline(FunctionTransformer(get_Assortment, validate=False), LabelEncoderPipelineFriendly(), OneHotEncoder()), make_pipeline(FunctionTransformer(get_Promo2Active, validate=False)), make_pipeline(FunctionTransformer(get_CompetitionActive, validate=False)), make_pipeline(FunctionTransformer(get_CompetitionDistance, validate=False), Imputer(), StandardScaler()), make_pipeline(FunctionTransformer(get_DayOfYear, validate=False), LabelEncoderPipelineFriendly(), OneHotEncoder()) ]) return p",No,5,4.0 pipeline = prepare_pipeline(train),No,3,4.0 "x_train, y_train = pipeline.fit_transform(train), train.Sales x_test = pipeline.transform(test)",No,4,7.0 "params = {""boosting_type"" : [""gbdt""], ""learning_rate"": [0.1], ""n_estimators"": [200], ""objective"": [""regression""], ""reg_alpha"": [1.0],# [0.0, 0.5, 1.0], # no time for an actual CV on kaggle ""reg_lambda"": [1.0],# [0.0, 0.5, 1.0], ""random_state"": [0], ""n_jobs"": [-1] }",No,5,59.0 "gs = GridSearchCV(LGBMRegressor(), params, scoring=rmspe_scorer, cv=2, n_jobs=1) gs.fit(x_train, y_train)",No,5,6.0 prediction = gs.predict(x_test),No,5,48.0 "pd.DataFrame({""Id"": test.Id, ""Sales"": prediction}).to_csv(""submission.csv"", sep="","", index=False)",No,5,25.0 "def prepare_pipeline_ts(df, min_shift, max_shift): def get_shifted_date(df, for_sales=False): return (df.Date.min() + pd.DateOffset(days_to_shift)) def get_DayOfWeek(df): return df[""DayOfWeek""] def get_Open(df): return df[[""Open""]] def get_Promo(df): return df[[""Promo""]] def get_StateHoliday(df): return df[""StateHoliday""] def get_SchoolHoliday(df): return df[[""SchoolHoliday""]] def get_StoreType(df): return df[""StoreType""] def get_Assortment(df): return df[""Assortment""] def get_Promo2Active(df): return df[[""Promo2Active""]] def get_CompetitionActive(df): return df[[""CompetitionActive""]] def get_CompetitionDistance(df): return df[[""CompetitionDistance""]] def get_DayOfYear(df): return df[""DayOfYear""] def get_previous_sales(df): sales = df[[""Store"", ""Sales""]].copy() for day in range(min_shift, max_shift + 1): sales[""Last-{}_Day_Sales"".format(day)] = sales.groupby(""Store"")[""Sales""].shift(day) sales[""Last-{}_Day_Diff"".format(day)] = sales.groupby(""Store"")[""Last-{}_Day_Sales"".format(day)].diff() return sales.drop([""Store"", ""Sales""], axis=1) p = make_union(*[ make_pipeline(FunctionTransformer(get_DayOfWeek, validate=False), LabelEncoderPipelineFriendly(), OneHotEncoder()), make_pipeline(FunctionTransformer(get_Open, validate=False), Imputer(strategy=""most_frequent"")), make_pipeline(FunctionTransformer(get_Promo, validate=False)), make_pipeline(FunctionTransformer(get_StateHoliday, validate=False), LabelEncoderPipelineFriendly(), OneHotEncoder()), make_pipeline(FunctionTransformer(get_SchoolHoliday, validate=False)), make_pipeline(FunctionTransformer(get_StoreType, validate=False), LabelEncoderPipelineFriendly(), OneHotEncoder()), make_pipeline(FunctionTransformer(get_Assortment, validate=False), LabelEncoderPipelineFriendly(), OneHotEncoder()), make_pipeline(FunctionTransformer(get_Promo2Active, validate=False)), 
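# Each make_pipeline branch in this union selects one raw column via FunctionTransformer
# and then label-/one-hot-encodes, imputes or scales it; make_union (a FeatureUnion)
# concatenates the branch outputs column-wise into the final feature matrix.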
make_pipeline(FunctionTransformer(get_CompetitionActive, validate=False)), make_pipeline(FunctionTransformer(get_CompetitionDistance, validate=False), Imputer(), StandardScaler()), make_pipeline(FunctionTransformer(get_DayOfYear, validate=False), LabelEncoderPipelineFriendly(), OneHotEncoder()), make_pipeline(FunctionTransformer(get_previous_sales, validate=False), Imputer(), StandardScaler()) ]) return p",No,5,53.0 "test_size = len(test) min_shift = (test.Date.max() - test.Date.min()).days max_shift = 180 to_drop = len(train[train.Date < train.Date.min() + pd.DateOffset(max_shift)])",No,5,77.0 "import warnings warnings.filterwarnings(""ignore"") #Data Manipulation and Treatment import numpy as np import pandas as pd from datetime import datetime #Plotting and Visualizations import matplotlib.pyplot as plt %matplotlib inline import seaborn as sns from scipy import stats import itertools #Scikit-Learn for Modeling from sklearn import model_selection from sklearn.ensemble import RandomForestRegressor from sklearn import metrics",No,5,23.0 "def str_to_date(date): return datetime.strptime(date, '%Y-%m-%d').date()",No,5,16.0 "#The training Set df_train = pd.read_csv(""../input/train.csv"",sep=',', parse_dates=['Date'] , date_parser=str_to_date, low_memory = False) #Additional Information on those stores df_store = pd.read_csv(""../input/store.csv"" , low_memory = False)'",No,5,45.0 df_train.head() ,No,5,41.0 df_train.tail(),No,5,41.0 "df_train.dtypes,print (""The Train dataset has {} Rows and {} Variables"".format(str(df_train.shape[0]),str(df_train.shape[1])))",No,3,40.0 df_store.tail(),No,5,41.0 "df_store.dtypes ,print (""The Store dataset has {} Rows (which means unique Shops) and {} Variables"".format(str(df_store.shape[0]),str(df_store.shape[1]))) ",No,3,40.0 df_train.count(0)/df_train.shape[0] * 100,No,2,54.0 "class bert(nn.Module): def __init__(self, bert_path): super().__init__() BERT = BertModel.from_pretrained(bert_path, config = BertConfig.from_pretrained(bert_path, output_hidden_states = True)) self.BERT = BERT self.fc = nn.Sequential(nn.BatchNorm1d(self.BERT.config.hidden_size * 3), nn.Dropout(0.4), nn.Linear(self.BERT.config.hidden_size * 3, 600), nn.BatchNorm1d(600), nn.Dropout(0.4), nn.Linear(600, 600), nn.BatchNorm1d(600), nn.Dropout(0.4), nn.Linear(600,3)) def forward(self, token, at_mask, offsets, layer): out = self.BERT(token, attention_mask = at_mask)[2][layer] out_lst = [] for j in range(out.shape[0]): out_lst.append(torch.stack([torch.tensor(out[j,offsets[j,0]]),torch.tensor(out[j,offsets[j,1]]),torch.tensor(out[j,offsets[j,2]])] , dim = 0) ) out_lst = torch.stack([word_embedding for word_embedding in out_lst], dim = 0) out = out_lst.reshape(out_lst.shape[0], -1) out = self.fc(out) return out def create_model(df_len,epoch_len): model = bert(bert_path) criteria = nn.CrossEntropyLoss() optimizer = AdamW(model.parameters(), eps = 1e-06, lr = 1e-4) scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=df_len*epoch_len) return model, criteria, optimizer, scheduler",Yes,3,4.0 "epoch_len = 20 model, criteria, optimizer, scheduler = create_model(len(df_train), epoch_len) set_trainable(model.BERT, False) aaa = 0 for t in range(epoch_len): tot_loss = 0 correct_train = 0 val_loss = 0 val_correct = 0 model = model.train() if GPU: model = model.cuda() for item in tqdm(train_loader): token = item[0] at_mask = item[3] offsets = item[2] target = item[1] if GPU: token = token.cuda() at_mask = at_mask.cuda() target = target.cuda() offsets = 
offsets.cuda() output = model(token, at_mask, offsets, -2) loss = criteria(output, target) tot_loss += loss.item() correct_train += torch.sum(torch.max(torch.nn.functional.softmax(output, dim = 1), dim = 1)[1] == target) optimizer.zero_grad() loss.backward() nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0) optimizer.step() scheduler.step() with torch.no_grad(): model = model.eval() if GPU: model = model.cuda() for item in tqdm(val_loader): token = item[0] at_mask = item[3] offsets = item[2] target = item[1] if GPU: token = token.cuda() at_mask = at_mask.cuda() offsets = offsets.cuda() target = target.cuda() output = model(token, at_mask, offsets, -2) val_correct += torch.sum(torch.max(torch.nn.functional.softmax(output, dim = 1), dim = 1)[1] == target) if val_correct > aaa: bst_model = model aaa = val_correct print(tot_loss, correct_train,"" "", val_correct,"" out of "", len(val_loader)*30)",No,4,7.0 "def predict(df, dataloader, model): tmp_array = np.zeros((len(df), 3)) with torch.no_grad(): model = model.eval() if GPU: model = model.cuda() j = 0 for item in tqdm(dataloader): token = item[0] at_mask = item[2] offsets = item[1] if GPU: token = token.cuda() at_mask = at_mask.cuda() offsets = offsets.cuda() output = model(token, at_mask, offsets, -2) for zz in output.cpu(): tmp_array[j] = zz j+=1 return tmp_array",No,4,49.0 "a = predict(test_2, test_2_loader, bst_model)",No,5,48.0 "bla = test_2[['ID']].merge(pd.DataFrame(torch.nn.functional.softmax(torch.tensor(a), dim = 1).numpy()), left_index=True, right_index=True).set_index('ID') bla.columns = ['A', 'B', 'NEITHER'] bla.to_csv('sbmsn2.csv')",No,4,25.0 data.columns,No,5,71.0 data.info(),No,5,40.0 data['Type'].unique(),No,5,57.0 "import matplotlib.pyplot as plt import seaborn as sns plt.figure(figsize=(14,6)) plt.subplot(1,2,1) sns.boxplot(data.revenue) plt.subplot(1,2,2) sns.distplot(data.revenue, bins=20, kde=False) plt.show()",No,5,33.0 "#City distribution data[""City""].value_counts().plot(kind='bar')'",No,5,33.0 "data[""Type""].value_counts().plot(kind='bar')'",No,5,33.0 "data[""City Group""].value_counts().plot(kind='bar')'",No,5,33.0 "# Crrelation between revenue and feature (p)s def numFeaturePlot(): features=(data.loc[:,'P1':'P37']).columns.tolist() plt.figure(figsize=(35,18)) j=1 while j7000],No,5,14.0 "g = sns.distplot(data['Sales']) g.set_title(""Data Distribution"") '",No,5,33.0 "zero_sales = data[data['Sales']==0].copy() data = data[data['Sales']!=0].drop('Open', 1)",No,4,10.0 "fig, ax = plt.subplots (1,4, figsize=(20,4)) sns.barplot(['Size'], [len(zero_sales)], ax=ax[0]) sns.countplot('DayOfWeek', data=zero_sales, ax=ax[1]) sns.countplot('Open', data=zero_sales, ax=ax[2]) sns.countplot('Promo', data=zero_sales, ax=ax[3]) plt.tight_layout()",No,5,75.0 "plt.figure(figsize=(8,8)) sns.heatmap(data .corr(), cmap='coolwarm')",No,5,80.0 "data.fillna(0, inplace=True) test.fillna(1, inplace=True)",No,5,17.0 "# Combining train and test data # Decomposing date features data['part'] = 'train' test['part'] = 'test' all_data = pd.concat([data, test], 0)[data.columns.tolist()+['Id']] all_data['Date'] = pd.to_datetime(all_data['Date']) all_data['Month'] = all_data['Date'].dt.month all_data['Year'] = all_data['Date'].dt.year all_data['Day'] = all_data['Date'].dt.day all_data['WeekOfYear'] = data.Date.dt.weekofyear all_data['Quarter'] = data.Date.dt.quarter all_data.sort_values('Date', inplace=True)",No,5,8.0 all_data.head(),No,5,41.0 "tmp = all_data[all_data['part']=='train'] sns.regplot('CompetitionOpen', 'Sales', data=tmp, 
ci=None)",No,5,33.0 "b""all_data['PromoOpen'] = 12 * (all_data.Year - all_data.Promo2SinceYear) + \\\n (all_data.WeekOfYear - all_data.Promo2SinceWeek) / 4.0\nall_data['PromoOpen'] = all_data['PromoOpen'].apply(lambda x: x if x > 0 else 0)""",No,5,8.0 "tmp = all_data[all_data['part']=='train'] sns.regplot('PromoOpen', 'Sales', data=tmp, ci=None)",No,5,33.0 "df_store[pd.isnull(df_store.CompetitionDistance)] #rows with missing values for Competition Distance, only 3 rows with null which makes sense since 99.73% is filled",No,2,14.0 "df_store_check_distribution=df_store.drop(df_store[pd.isnull(df_store.CompetitionDistance)].index) fig, axes = plt.subplots(1, 2, figsize=(17,3.5)) axes[0].boxplot(df_store_check_distribution.CompetitionDistance, showmeans=True,vert=False,) axes[0].set_xlim(0,max(df_store_check_distribution.CompetitionDistance+1000)) axes[0].set_title('Boxplot For Closest Competition') axes[1].hist(df_store_check_distribution.CompetitionDistance, cumulative=False, bins=30) axes[1].set_title(""Closest Competition histogram"") axes[1].set_xlim((min(df_store_check_distribution.CompetitionDistance), max(df_store_check_distribution.CompetitionDistance))) {""Mean"":np.nanmean(df_store.CompetitionDistance),""Median"":np.nanmedian(df_store.CompetitionDistance),""Standard Dev"":np.nanstd(df_store.CompetitionDistance)}#That's what i thought, very different values, let's see why '",No,2,33.0 "df_store['CompetitionDistance'].fillna(df_store['CompetitionDistance'].median(), inplace = True)",No,5,17.0 "df_store.CompetitionOpenSinceMonth.fillna(0, inplace = True) df_store.CompetitionOpenSinceYear.fillna(0,inplace=True)",No,5,17.0 "df_store.Promo2SinceWeek.fillna(0,inplace=True) df_store.Promo2SinceYear.fillna(0,inplace=True) df_store.PromoInterval.fillna(0,inplace=True)",No,5,17.0 "#Left-join the train to the store dataset since .Why? 
#Because you want to make sure you have all events even if some of them don't have their store information ( which shouldn't happen) df_train_store = pd.merge(df_train, df_store, how = 'left', on = 'Store') df_train_store.head() print (""The Train_Store dataset has {} Rows and {} Variables"".format(str(df_train_store.shape[0]),str(df_train_store.shape[1]))) '",No,4,32.0 df_train_store['SalesperCustomer']=df_train_store['Sales']/df_train_store['Customers'],No,5,8.0 df_train_store.head(),No,5,41.0 "fig, axes = plt.subplots(2, 3,figsize=(17,10) ) palette = itertools.cycle(sns.color_palette(n_colors=4)) plt.subplots_adjust(hspace = 0.28) #axes[1].df_train_store.groupby(by=""StoreType"").count().Store.plot(kind='bar') axes[0,0].bar(df_store.groupby(by=""StoreType"").count().Store.index,df_store.groupby(by=""StoreType"").count().Store,color=[next(palette),next(palette),next(palette),next(palette)]) axes[0,0].set_title(""Number of Stores per Store Type \ Fig 1.1"") axes[0,1].bar(df_train_store.groupby(by=""StoreType"").sum().Sales.index,df_train_store.groupby(by=""StoreType"").sum().Sales/1e9,color=[next(palette),next(palette),next(palette),next(palette)]) axes[0,1].set_title(""Total Sales per Store Type (in Billions) \ Fig 1.2"") axes[0,2].bar(df_train_store.groupby(by=""StoreType"").sum().Customers.index,df_train_store.groupby(by=""StoreType"").sum().Customers/1e6,color=[next(palette),next(palette),next(palette),next(palette)]) axes[0,2].set_title(""Total Number of Customers per Store Type (in Millions) \ Fig 1.3"") axes[1,0].bar(df_train_store.groupby(by=""StoreType"").sum().Customers.index,df_train_store.groupby(by=""StoreType"").Sales.mean(),color=[next(palette),next(palette),next(palette),next(palette)]) axes[1,0].set_title(""Average Sales per Store Type \ Fig 1.4"") axes[1,1].bar(df_train_store.groupby(by=""StoreType"").sum().Customers.index,df_train_store.groupby(by=""StoreType"").Customers.mean(),color=[next(palette),next(palette),next(palette),next(palette)]) axes[1,1].set_title(""Average Number of Customers per Store Type \ Fig 1.5"") axes[1,2].bar(df_train_store.groupby(by=""StoreType"").sum().Sales.index,df_train_store.groupby(by=""StoreType"").SalesperCustomer.mean(),color=[next(palette),next(palette),next(palette),next(palette)]) axes[1,2].set_title(""Average Spending per Customer in each Store Type \ Fig 1.6"") plt.show()'",No,3,33.0 "StoretypeXAssortment = sns.countplot(x=""StoreType"",hue=""Assortment"",order=[""a"",""b"",""c"",""d""], data=df_store,palette=sns.color_palette(""Set2"", n_colors=3)).set_title(""Number of Different Assortments per Store Type"") df_store.groupby(by=[""StoreType"",""Assortment""]).Assortment.count() ",No,2,33.0 "df_train_store['Month']=df_train_store.Date.dt.month df_train_store['Year']=df_train_store.Date.dt.year",No,3,8.0 " sns.factorplot(data = df_train_store, x =""Month"", y = ""Sales"", col = 'Promo', # per store type in cols hue = 'Promo2', row = ""Year"" ,sharex=False) '",No,4,33.0 "sns.factorplot(data = df_train_store, x =""Month"", y = ""SalesperCustomer"", col = 'Promo', # per store type in cols hue = 'Promo2', row = ""Year"" ,sharex=False)'",No,2,33.0 "sns.factorplot(data = df_train_store, x =""DayOfWeek"", y = ""Sales"", hue='Promo' ,sharex=False)'",No,2,33.0 "#33 Stores are opened on Sundays print (""Number of Stores opened on Sundays:{}"" .format(df_train_store[(df_train_store.Open == 1) & (df_train_store.DayOfWeek == 7)]['Store'].unique().shape[0]))'",No,3,54.0 
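# A small hedged follow-up to the Sunday count above (a sketch, not a required step):
# for the stores that do open on Sundays, compare their average Sunday sales with
# their sales on other days. Only df_train_store and the columns already used above
# (Open, DayOfWeek, Store, Sales) are assumed; sunday_stores and subset are new names
# introduced just for this illustration.
sunday_stores = df_train_store[(df_train_store.Open == 1) &
                               (df_train_store.DayOfWeek == 7)]['Store'].unique()
subset = df_train_store[df_train_store.Store.isin(sunday_stores)]
print(subset.groupby(subset.DayOfWeek == 7)['Sales'].mean())  # False = Mon-Sat, True = Sunday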
"df_train_store['CompetitionDist_Cat']=pd.cut(df_train_store['CompetitionDistance'], 5)",No,4,13.0 "df_train_store.groupby(by=""CompetitionDist_Cat"").Sales.mean(),df_train_store.groupby(by=""CompetitionDist_Cat"").Customers.mean()",No,5,60.0 "del df_train_store[""CompetitionDist_Cat""]",No,5,10.0 df_train_store['Day']=df_train_store.Date.dt.day,No,5,8.0 "del df_train_store[""Date""]",No,5,10.0 "df_train_store['StoreType'].isnull().any(),df_train_store['Assortment'].isnull().any(),df_train_store['StateHoliday'].isnull().any() #No Null values we can proceed with the transformation",No,2,39.0 "df_train_store[""StoreType""].value_counts(),df_train_store[""Assortment""].value_counts(),df_train_store[""StateHoliday""].value_counts()",No,4,54.0 "df_train_store['StateHoliday'] = df_train_store['StateHoliday'].astype('category') df_train_store['Assortment'] = df_train_store['Assortment'].astype('category') df_train_store['StoreType'] = df_train_store['StoreType'].astype('category') df_train_store['PromoInterval']= df_train_store['PromoInterval'].astype('category')",No,5,16.0 "df_train_store['StateHoliday_cat'] = df_train_store['StateHoliday'].cat.codes df_train_store['Assortment_cat'] = df_train_store['Assortment'].cat.codes df_train_store['StoreType_cat'] = df_train_store['StoreType'].cat.codes df_train_store['PromoInterval_cat'] = df_train_store['PromoInterval'].cat.codes ",No,3,16.0 "df_train_store['StateHoliday_cat'] = df_train_store['StateHoliday_cat'].astype('float') df_train_store['Assortment_cat'] = df_train_store['Assortment_cat'].astype('float') df_train_store['StoreType_cat'] = df_train_store['StoreType_cat'].astype('float') df_train_store['PromoInterval_cat'] = df_train_store['PromoInterval_cat'].astype('float')",No,5,16.0 "from argparse import Namespace #There are 1115 stores. Select a small sample to do experimentation on. Select all for full training. num_sample_stores=1115 #The test set is 47 days. Normally use the last 47 days of the training data for validation. 
Se to 0 and use all data for traing when submitting to kaggle valid_days=0 #Hyperparameters s= Namespace( **{ ""l1"":4497, ""l2"":2328, ""ps1"":0.2771132028380148, ""ps2"":0.15631474446268287, ""emb_drop"":0.14301109844119272, ""batchsize"":64, ""lrate"":0.0660858230905056, ""lrate_ratio"":9, ""wd"":0.17305139150930285, ""l1epoch"":4, ""l2epoch"":3, ""l3epoch"":8, }) ",No,5,59.0 "from pathlib import Path from datetime import datetime, timedelta import numpy as np import pandas as pd from fastai import * from fastai.tabular import * #display results import plotly import plotly.plotly as py import plotly.graph_objs as go import cufflinks as cf ",No,5,22.0 "plotly.offline.init_notebook_mode(connected=False) cf.go_offline() %matplotlib inline %reload_ext autoreload %autoreload 2 pd.set_option('display.max_columns', 0) pd.set_option('display.max_rows', 500)",No,5,23.0 !ls ../input/,No,5,88.0 "path=Path(""../input/rossmann-data-engineering/"") traindf=pd.read_feather(path/""train.feather"") testdf=pd.read_feather(path/""test.feather"")",No,5,44.0 "best_type= data.sort_values('revenue', ascending=False) plt.figure(figsize=(13,12)) sns.barplot(x=best_type['Type'], y=best_type['revenue'])",No,4,9.0 data,No,5,41.0 "#Select size validation set based on valid_days variable from datetime import datetime, timedelta valid_idx=traindata[traindata.Date>=(traindata.Date.max()- timedelta(days=valid_days))].index.tolist()",No,4,8.0 "#Convert datetime columns to int64 for traning datecols=traindata.select_dtypes(include=""datetime"").columns.tolist() traindata[datecols]=traindata[datecols].astype(""int64"") testdf[datecols]=testdf[datecols].astype(""int64"")",No,5,16.0 "procs = [FillMissing, Categorify, Normalize] dep_var = 'Sales' #cont_names,cat_names= cont_cat_split(sample_train,dep_var=""Sales"") cont_names=[ 'CompetitionDistance', 'Week', 'Day', 'Dayofyear', 'Elapsed', 'ratio-sales-customer', 'ratio-saturday-week', 'ratio-sunday-week', 'ratio-promo-nopromo', 'Promo_thisweek', 'Open_thisweek', 'StateHolidayBool_thisweek', 'SchoolHoliday_thisweek', 'Promo_prevweek', 'Open_prevweek', 'StateHolidayBool_prevweek', 'SchoolHoliday_prevweek', 'Promo_nextweek', 'Open_nextweek', 'StateHolidayBool_nextweek', 'SchoolHoliday_nextweek', 'Promo2Days', 'CompetitionDaysOpen', 'trend', 'trend_DE', 'Max_Humidity', 'Max_Wind_SpeedKm_h', 'Mean_Humidity', 'Mean_TemperatureC', 'Max_TemperatureC_chnage', 'Month_Sales_mean', 'Year_Sales_mean', 'Dayofweek_Sales_mean', 'Dayofweek_promo_Sales_mean', 'BeforeSchoolHoliday', 'AfterSchoolHoliday', 'BeforeClosed', 'AfterClosed', 'BeforePromo', 'AfterPromo', 'BeforeStateHolidayBool', 'AfterStateHolidayBool', 'Promo2ActiveMonthBool', 'BeforePromo2ActiveMonthBool', 'AfterPromo2ActiveMonthBool', 'SchoolHoliday_fw', 'StateHolidayBool_fw', 'Promo_fw', 'Closed_fw', 'Promo2ActiveMonthBool_fw', 'CompetitionOpenSince', 'Promo2Since' ] cat_names=[ 'Store', 'DayOfWeek', 'Open', 'Promo', 'StateHoliday', 'SchoolHoliday', 'StoreType', 'Assortment', 'Promo2', 'PromoInterval', 'Year', 'Month', 'Dayofweek', 'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start', 'Promo2SinceYear', 'Promo2Na', 'Events', 'Fog', 'Hail', 'Rain', 'Snow', 'Thunderstorm', 'Quarter', 'CompetitionOpenNA', 'CompetitionDistanceNA', 'CompetitionOpenSinceYear', 'State' ] '",No,3,21.0 "max_log_y = np.log(np.max(traindata['Sales']))#*1.2 y_range = torch.tensor([0, max_log_y], device=defaults.device)",No,4,21.0 "databunch = (TabularList.from_df(traindata, path="""", cat_names=cat_names, 
cont_names=cont_names, procs=procs,) .split_by_idx(valid_idx) .label_from_df(cols=dep_var, label_cls=FloatList, log=True) .add_test(TabularList.from_df(testdf, path=path, cat_names=cat_names, cont_names=cont_names)) .databunch()) databunch.batch_size=s.batchsize",No,4,13.0 "learn = tabular_learner(databunch, layers=[s.l1,s.l2], ps=[s.ps1,s.ps2], emb_drop=s.emb_drop, y_range=y_range, metrics=exp_rmspe)",No,5,4.0 learn.lr_find(),No,5,2.0 learn.recorder.plot(),No,5,35.0 "learn.fit_one_cycle(s.l1epoch, s.lrate, wd=s.wd)",No,5,7.0 "learn.fit_one_cycle(s.l2epoch, s.lrate/s.lrate_ratio, wd=s.wd)",No,5,7.0 "learn.fit_one_cycle(s.l3epoch, s.lrate/(s.lrate_ratio*s.lrate_ratio), wd=s.wd)",No,5,7.0 "valid_preds=learn.get_preds(DatasetType.Valid) traindata[""SalesPreds""]=pd.Series(index=traindata.iloc[valid_idx].index,data=np.exp(valid_preds[0].numpy().T[0]))",No,4,48.0 "#Define error function def rmspe_metric(act,pred): return np.sqrt(np.mean(((act-pred)/act)**2))",No,5,84.0 "rmspe_metric(traindata.Sales,traindata.SalesPreds)",No,5,28.0 "#Sort stores by how much error store_rmspe=traindata.groupby([""Store""]).apply(lambda x:rmspe_metric(x.Sales,x.SalesPreds)).sort_values(ascending=False)",No,5,28.0 "store_rmspe.iplot(kind=""histogram"")",No,5,33.0 store_rmspe[:10],No,5,41.0 "t=traindata.set_index(""Date"")",No,5,61.0 "#Stores with most error for store in store_rmspe.index[:4].tolist(): t[t.Store==store][[""Sales"",""SalesPreds""]].iplot(kind=""bar"",barmode=""overlay"",title=""Store {}"".format(store))",No,5,33.0 "#Stores with least error for store in store_rmspe.index[-4:].tolist(): t[t.Store==store][[""Sales"",""SalesPreds""]].iplot(kind=""bar"",barmode=""overlay"",title=""Store {}"".format(store))",No,5,33.0 "test_preds=learn.get_preds(DatasetType.Test) testdf[""Sales""]=np.exp(test_preds[0].data).numpy().T[0] testdf[[""Id"",""Sales""]]=testdf[[""Id"",""Sales""]].astype(""int"") testdf[[""Id"",""Sales""]].to_csv(""rossmann_submission.csv"",index=False)",No,3,25.0 "import pandas as pd import numpy as np import matplotlib.pyplot as plt from xgboost import XGBRegressor",No,5,22.0 "train_data = pd.read_csv(""../input/train.csv"",low_memory= False) test_data = pd.read_csv(""../input/test.csv"",low_memory= False) store_data = pd.read_csv(""../input/store.csv"",low_memory= False) test_copy = test_data",No,5,45.0 "print(""Shape of Train data :"", train_data.shape) print(""Shape of Test data :"", test_data.shape) print(""Shape of Store data :"", store_data.shape)",No,5,58.0 train_data.head(),No,5,41.0 store_data.head(100),No,5,41.0 train_data.isnull().sum(),No,5,39.0 test_data.isnull().sum(),No,5,39.0 store_data.isnull().sum().sort_values(ascending = False),No,5,39.0 store_data['Promo2SinceWeek'].unique(),No,5,57.0 train_data['Store'].unique(),No,5,57.0 train_data['DayOfWeek'].unique(),No,5,57.0 train_data['Open'].unique(),No,5,57.0 train_data['StateHoliday'].unique(),No,5,57.0 train_data['Promo'].unique(),No,5,57.0 store_data['CompetitionOpenSinceMonth'].unique(),No,5,57.0 "print(sum(train_data[""Open""] == 0)) print(sum(train_data[""Open""] == 1))",No,5,72.0 "print(sum(test_data[""Open""] == 0)) print(sum(test_data[""Open""] == 1))",No,5,72.0 "print(sum(train_data[""StateHoliday""] == 'a')) print(sum(train_data[""StateHoliday""] == 'b')) print(sum(train_data[""StateHoliday""] == 'c')) print(sum(train_data[""StateHoliday""] == 0))'",No,5,72.0 "plt.plot(train_data['DayOfWeek'],train_data['Customers'])",No,5,81.0 "train_data[['Sales','Customers','Promo','SchoolHoliday']].corr(method='pearson')",No,5,40.0 
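# A minimal cross-check sketch for the date features built in the next cell, assuming
# train_data['Date'] holds 'YYYY-MM-DD' strings as loaded above: the month and year taken
# there by string slicing can equivalently be read off the pandas datetime accessors.
# `dates` is a throwaway name introduced only for this sketch.
dates = pd.to_datetime(train_data['Date'])
assert (dates.dt.month == train_data['Date'].str[5:7].astype(int)).all()
assert (dates.dt.year == train_data['Date'].str[:4].astype(int)).all()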
"train_data['Mon'] = train_data[""Date""].apply(lambda x : int(x[5:7])) train_data['Yr'] = train_data[""Date""].apply(lambda x : int(x[:4])) train_data[""HolidayBin""] = train_data.StateHoliday.map({""0"": 0, ""a"": 1, ""b"": 1, ""c"": 1})'",No,4,16.0 "test_data['Mon'] = test_data[""Date""].apply(lambda x : int(x[5:7])) test_data['Yr'] = test_data[""Date""].apply(lambda x : int(x[:4])) test_data[""HolidayBin""] = test_data.StateHoliday.map({""0"": 0, ""a"": 1, ""b"": 1, ""c"": 1})'",No,5,16.0 "train_data = train_data.merge(store_data) test_data =test_data.merge(store_data)",No,5,32.0 train_data.isnull().sum().sort_values(ascending= False),No,5,39.0 test_data.isnull().sum().sort_values(ascending= False),No,5,39.0 test_data[test_data['Open'].isnull()],No,5,14.0 "for i in train_data['Promo2SinceWeek'].unique() : print(i ,':', sum(train_data['Promo2SinceWeek'] == i )) ",No,5,72.0 "for i in train_data['CompetitionOpenSinceMonth'].unique() : print(i ,':', sum(train_data['CompetitionOpenSinceMonth'] == i ))",No,5,72.0 "for i in train_data['Promo2SinceYear'].unique() : print(i ,':', sum(train_data['Promo2SinceYear'] == i ))",No,5,72.0 "for i in train_data['CompetitionOpenSinceYear'].unique() : print(i ,':', sum(train_data['CompetitionOpenSinceYear'] == i ))",No,5,72.0 "train_data = train_data.drop(['Customers', 'Store','Date','StateHoliday'],axis= 1 ) test_data = test_data.drop(['Date','StateHoliday','Store','Id'],axis= 1 )",No,5,10.0 sum(train_data['Open'] == 0),No,5,72.0 train_data = train_data.drop(train_data[train_data['Open'] == 0].index.tolist()),No,5,10.0 train_data[train_data['HolidayBin'].isnull()],No,5,14.0 "train_data['CompetitionOpenSinceMonth'] = train_data['CompetitionOpenSinceMonth'].fillna(9.0) train_data['HolidayBin'] = train_data['HolidayBin'].fillna(0) train_data['Promo2SinceWeek'] = train_data['Promo2SinceWeek'].fillna(40.0) train_data['Promo2SinceYear'] = train_data['Promo2SinceYear'].fillna(2012.0) train_data['CompetitionOpenSinceYear'] = train_data['CompetitionOpenSinceYear'].fillna(2012.0) train_data['CompetitionDistance'] = train_data['CompetitionDistance'].fillna(train_data['CompetitionDistance'].mean()) train_data.isnull().sum().sort_values(ascending = False)",No,5,17.0 "test_data['Open'] = test_data['Open'].fillna(1) test_data['CompetitionOpenSinceMonth'] = test_data['CompetitionOpenSinceMonth'].fillna(9.0) test_data['CompetitionDistance'] = test_data['CompetitionDistance'].fillna(train_data['CompetitionDistance'].mean()) test_data['CompetitionOpenSinceYear'] = test_data['CompetitionOpenSinceYear'].fillna(2012.0) test_data['Promo2SinceWeek'] = test_data['Promo2SinceWeek'].fillna(40.0) test_data['Promo2SinceYear'] = test_data['Promo2SinceYear'].fillna(2012.0) test_data.isnull().sum().sort_values(ascending = False)",No,4,17.0 sum(train_data['Sales'] < 0 ),No,5,72.0 train_data.head(100),No,5,41.0 "categorical_train = train_data.columns.tolist() print(categorical_train) train_data[categorical_train].corr(method='pearson')",No,3,80.0 "train_features = train_data.drop(['Open'],axis = 1) categorical_train = train_features.columns.tolist() print(categorical_train) train_data[categorical_train].corr(method='pearson') train_features = train_data.drop(['Sales'],axis = 1) full_features = pd.concat([train_features,test_data],ignore_index= True) print(train_features.shape) print(test_data.shape)",No,2,80.0 full_features.head(),No,5,41.0 full_features.shape,No,5,58.0 "full_features = pd.get_dummies(full_features,columns= ['HolidayBin','Assortment','StoreType'])",No,5,20.0 
"full_features = full_features.drop('PromoInterval',axis = 1)",No,5,10.0 "train_features = full_features.iloc[:844392,:].values test_data = full_features.iloc[844392:,:].values train_sales = train_data['Sales'].values",No,5,13.0 "print(train_features.shape) print(train_sales.shape) print(test_data.shape)",No,5,58.0 "xgboost = XGBRegressor(learning_rate=0.009, n_estimators=500, max_depth=10, min_child_weight=0, gamma=0, subsample=0.7, colsample_bytree=0.7, objective='reg:linear', nthread=-1, scale_pos_weight=1, seed=27, reg_alpha=0.00006, random_state=42)",No,5,4.0 "xgboost.fit(train_features,train_sales) ",No,5,7.0 predictions = xgboost.predict(test_data),No,5,48.0 "pred_df = pd.DataFrame({""Id"": test_copy[""Id""], 'Sales': predictions}) pred_df.to_csv(""xgboost_4_submission.csv"", index=False)'",No,4,25.0 "import os import string import numpy as np import pandas as pd from pandasql import sqldf import matplotlib.pyplot as plt from keras.utils.np_utils import to_categorical from keras.models import Model, Sequential, model_from_json from keras.optimizers import SGD, Adam, RMSprop from keras.layers import Input, Dense, Dropout, Flatten, Lambda, Embedding from keras.initializers import RandomNormal, Constant from keras.callbacks import ModelCheckpoint, EarlyStopping from keras import regularizers from keras import backend as K import tensorflow as tf from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder from sklearn.model_selection import train_test_split from sklearn.metrics import mean_squared_error import seaborn as sns import warnings from math import sqrt import itertools from tqdm import tqdm np.random.seed(42) # for reproducibility sns.set(style=""whitegrid"", color_codes=True) sns.set(font_scale=1) pd.set_option('display.max_columns', 60) %matplotlib inline warnings.filterwarnings('ignore')'",No,5,23.0 "b""def concat_data():\n df_train = pd.read_csv('../input/train.csv')\n df_test = pd.read_csv('../input/test.csv')\n df_extra = pd.read_csv('../input/store.csv')\n df_test['Sales'] = -1\n df_full = pd.concat([df_train, df_test]).reset_index(drop=True)\n\n #Merge extra information about stores\n df_full = df_full.merge(df_extra, left_on=['Store'], right_on=['Store'], how='left')\n \n df_full['Year'] = pd.DatetimeIndex(df_full['Date']).year\n df_full['Month'] = pd.DatetimeIndex(df_full['Date']).month\n df_full['Day'] = pd.DatetimeIndex(df_full['Date']).day\n df_full['WeekOfYear'] = pd.DatetimeIndex(df_full['Date']).weekofyear\n \n # Calculate competition open in months\n df_full['CompetitionOpen'] = 12 * (df_full.Year - df_full.CompetitionOpenSinceYear) + \\\n (df_full.Month - df_full.CompetitionOpenSinceMonth)\n\n # Calculate promo open time in months\n df_full['PromoOpen'] = 12 * (df_full.Year - df_full.Promo2SinceYear) + \\\n (df_full.WeekOfYear - df_full.Promo2SinceWeek) / 4.0\n df_full['PromoOpen'] = df_full.PromoOpen.apply(lambda x: x if x > 0 else 0)\n df_full.loc[df_full.Promo2SinceYear == 0, 'PromoOpen'] = 0\n\n # Transform month interval in a boolean column \n month2str = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun',\n 7:'Jul', 8:'Aug', 9:'Sept', 10:'Oct', 11:'Nov', 12:'Dec'}\n df_full['monthStr'] = df_full.Month.map(month2str)\n df_full.loc[df_full.PromoInterval == 0, 'PromoInterval'] = ''\n df_full['IsPromoMonth'] = 0\n for interval in df_full.PromoInterval.unique():\n interval = str(interval)\n if interval != '':\n for month in interval.split(','):\n df_full.loc[(df_full.monthStr == month) & (df_full.PromoInterval == interval), 
'IsPromoMonth'] = 1\n\n\n return df_full\n\ndf_full = concat_data()""",Yes,2,45.0 "def extrat_test_data(df_full): df_train = df_full.loc[df_full['Sales'] != -1] df_test = df_full.loc[df_full['Sales'] == -1] return df_train, df_test df_train, df_test = extrat_test_data(df_full)",No,4,13.0 df_full.head(),No,5,41.0 "# Function to calculate missing values by column (By DSA) def missing_values_table(df): # Total missing values mis_val = df.isnull().sum() # Percentage of missing values mis_val_percent = 100 * df.isnull().sum() / len(df) # Make a table with the results mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1) # Rename the columns mis_val_table_ren_columns = mis_val_table.rename( columns = {0 : 'Missing Values', 1 : '% of Total Values'}) # Sort the table by percentage of missing descending mis_val_table_ren_columns = mis_val_table_ren_columns[ mis_val_table_ren_columns.iloc[:,1] != 0].sort_values( '% of Total Values', ascending=False).round(1) # Print some summary information print (""Your selected dataframe has "" + str(df.shape[1]) + "" columns.\ "" ""There are "" + str(mis_val_table_ren_columns.shape[0]) + "" columns that have missing values."") # Return the dataframe with missing information return mis_val_table_ren_columns missing_values_table(df_full)'",No,4,39.0 df_train_store.dtypes,No,5,70.0 df_full.groupby('StoreType')['Sales'].describe(),No,5,40.0 "df_full.groupby('StoreType')['Customers', 'Sales'].sum()",No,5,60.0 "# Plotting correlations num_feat=df_full.columns[df_full.dtypes!=object] num_feat=num_feat[1:-1] labels = [] values = [] for col in num_feat: labels.append(col) values.append(np.corrcoef(df_full[col].values, df_full['Sales'].values)[0,1]) ind = np.arange(len(labels)) width = 0.9 fig, ax = plt.subplots(figsize=(10,15)) rects = ax.barh(ind, np.array(values), color='red') ax.set_yticks(ind+((width)/2.)) ax.set_yticklabels(labels, rotation='horizontal') ax.set_xlabel(""Correlation coefficient"") ax.set_title(""Correlation Coefficients w.r.t Sales"")'",No,3,80.0 "# Heatmap of correlations features corrMatrix=df_full[[""Sales"", ""DayOfWeek"", ""Open"", ""Promo"", ""SchoolHoliday"", ""CompetitionDistance"", ""CompetitionOpenSinceMonth"", ""CompetitionOpenSinceYear"", ""Promo2"", ""Promo2SinceWeek"", ""Promo2SinceYear"", ""Year"", ""Month"", ""Day"", ""CompetitionOpen"", ""PromoOpen"", ""IsPromoMonth"", ""Store""]].corr() sns.set(font_scale=1.10) plt.figure(figsize=(30, 30)) sns.heatmap(corrMatrix, vmax=.8, linewidths=0.01, square=True,annot=True,cmap='viridis',linecolor=""white"") plt.title('Correlation between features')'",No,4,80.0 "def clean_data(use_text_columns = True): ''' Function that clean data and create a new features to enrich the model ''' cols_num = [""Sales"", ""DayOfWeek"", ""Open"", ""Promo"", ""SchoolHoliday"", ""CompetitionDistance"", ""CompetitionOpenSinceMonth"", ""CompetitionOpenSinceYear"", ""Promo2"", ""Promo2SinceWeek"", ""Promo2SinceYear"", ""Wapp"", ""Avg_Customers"", ""Year"", ""Month"", ""Day"", ""CompetitionOpen"", ""PromoOpen"", ""IsPromoMonth"", ""Store""] cols_text = [""StateHoliday"", ""StoreType"", ""Assortment""] df_train = pd.read_csv('../input/train.csv') len_train_data = len(df_train) df_test = pd.read_csv('../input/test.csv') # Setting null values of column Open in test dataset df_test.loc[df_test['DayOfWeek'] != 7, 'Open'] = 1 df_test.loc[df_test['DayOfWeek'] == 7, 'Open'] = 0 avg_customer = sqldf( """""" SELECT Store, DayOfWeek, sum(case when Customers is not null then Sales/Customers else 0 end) as Wapp, 
round(avg(Customers)) Avg_Customers from df_train group by Store,DayOfWeek """""" ) df_test = sqldf( """""" SELECT t.*, ac.Wapp, ac.Avg_Customers from df_test t left join avg_customer ac on t.Store = ac.Store and t.DayOfWeek = ac.DayOfWeek """""" ) df_train = sqldf( """""" SELECT t.*, ac.Wapp, ac.Avg_Customers from df_train t left join avg_customer ac on t.Store = ac.Store and t.DayOfWeek = ac.DayOfWeek """""" ) # Merge train and test dataset all_data = pd.concat([df_train, df_test], ignore_index=True) df_extra = pd.read_csv('../input/store.csv') df_full = pd.concat([df_train, df_test]).reset_index(drop=True) # Merge extra information about stores all_data = df_full.merge(df_extra, left_on=['Store'], right_on=['Store'], how='left') # Separate date in Year, Month and Day all_data.loc[all_data['StateHoliday'] == 0, 'StateHoliday'] = 'd' all_data['Year'] = pd.DatetimeIndex(all_data['Date']).year all_data['Month'] = pd.DatetimeIndex(all_data['Date']).month all_data['Day'] = pd.DatetimeIndex(all_data['Date']).day all_data['WeekOfYear'] = pd.DatetimeIndex(all_data['Date']).weekofyear # Calculate competition open in months all_data['CompetitionOpen'] = 12 * (all_data.Year - all_data.CompetitionOpenSinceYear) + \\ (all_data.Month - all_data.CompetitionOpenSinceMonth) # Calculate promo open time in months all_data['PromoOpen'] = 12 * (all_data.Year - all_data.Promo2SinceYear) + \\ (all_data.WeekOfYear - all_data.Promo2SinceWeek) / 4.0 all_data['PromoOpen'] = all_data.PromoOpen.apply(lambda x: x if x > 0 else 0) all_data.loc[all_data.Promo2SinceYear == 0, 'PromoOpen'] = 0 # Transform month interval in a boolean column month2str = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', 7:'Jul', 8:'Aug', 9:'Sept', 10:'Oct', 11:'Nov', 12:'Dec'} all_data['monthStr'] = all_data.Month.map(month2str) all_data.loc[all_data.PromoInterval == 0, 'PromoInterval'] = '' all_data['IsPromoMonth'] = 0 for interval in all_data.PromoInterval.unique(): interval = str(interval) if interval != '': for month in interval.split(','): all_data.loc[(all_data.monthStr == month) & (all_data.PromoInterval == interval), 'IsPromoMonth'] = 1 data_numeric = all_data[cols_num] # Fill NAN values # Only column CompetitionDistance is fill NaN with a median value data_numeric['CompetitionDistance'].fillna(data_numeric['CompetitionDistance'].median(), inplace = True) # Other values is fill with zero data_numeric.fillna(0, inplace = True) if (use_text_columns): data_text = all_data[cols_text] data_text = pd.get_dummies(data_text, dummy_na=False) complete_data = pd.concat([data_numeric, data_text], axis = 1) df_train = complete_data.iloc[:len_train_data,:] df_test = complete_data.iloc[len_train_data:,:] else: df_train = data_numeric.iloc[:len_train_data,:] df_test = data_numeric.iloc[len_train_data:,:] return df_train, df_test'",Yes,3,43.0 "def load_train_data(scaler_x, scaler_y): ''' Transform train data set and separate a test dataset to validate the model in the end of training and normalize data ''' X_train = train.drop([""Sales""], axis=1) # Features y_train = np.array(train[""Sales""]).reshape((len(X_train), 1)) # Targets X_train = scaler_x.fit_transform(X_train) y_train = scaler_y.fit_transform(y_train) X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.20, random_state=42) return (X_train, y_train), (X_test, y_test)'",No,3,13.0 "def load_test_data(): ''' Remove column of predictions and normalize data of submission test data set. 
''' X_test = test.drop([""Sales""], axis=1) # Features X_test = StandardScaler().fit_transform(X_test) return X_test'",No,4,20.0 "b""# Show info of model\ndef show_info(model, X, y, log, weights = None):\n '''\n Show metrics about the evaluation model and plots about loss, rmse and rmspe\n '''\n if (log != None):\n # summarize history for loss\n plt.figure(figsize=(14,10))\n plt.plot(log.history['loss'])\n plt.plot(log.history['val_loss'])\n plt.title('Model Loss')\n plt.ylabel('loss')\n plt.xlabel('epoch')\n plt.legend(['train', 'test'], loc='upper left')\n plt.show()\n print('\\n')\n \n # summarize history for rmse\n plt.figure(figsize=(14,10))\n plt.plot(log.history['rmse'])\n plt.plot(log.history['val_rmse'])\n plt.title('Model RMSE')\n plt.ylabel('rmse')\n plt.xlabel('epoch')\n plt.legend(['train', 'test'], loc='upper left')\n plt.show()\n print('\\n')\n \n # summarize history for rmspe\n plt.figure(figsize=(14,10))\n plt.plot(log.history['rmspe'])\n plt.plot(log.history['val_rmspe'])\n plt.title('Model RMSPE')\n plt.ylabel('rmspe')\n plt.xlabel('epoch')\n plt.legend(['train', 'test'], loc='upper left')\n plt.show()\n\n if (weights != None):\n model.load_weights(weights)\n\n predictions = model.predict(X, verbose=1)\n\n mse = mean_squared_error(y, predictions)\n rmse = sqrt(mse)\n rmspe = rmspe_val(y, predictions)\n\n print('MSE: %.3f' % mse)\n print('RMSE: %.3f' % rmse)\n print('RMSPE: %.3f' % rmspe)""",Yes,3,28.0 "def rmspe_val(y_true, y_pred): ''' RMSPE calculus to validate evaluation metric about the model ''' return np.sqrt(np.mean(np.square((y_true - y_pred) / y_true), axis=0))[0]",No,5,84.0 "def rmspe(y_true, y_pred): ''' RMSPE calculus to use during training phase ''' return K.sqrt(K.mean(K.square((y_true - y_pred) / y_true), axis=-1))",No,5,84.0 "def rmse(y_true, y_pred): ''' RMSE calculus to use during training phase ''' return K.sqrt(K.mean(K.square(y_pred - y_true)))",No,5,84.0 "def create_model(): ''' Create a neural network ''' initializer = RandomNormal(mean=0.0, stddev=0.05, seed=None) model = Sequential() model.add(Dense(512, input_dim=X_train.shape[1], activation=""relu"", kernel_initializer=initializer)) model.add(Dropout(0.4)) model.add(Dense(512, input_dim=X_train.shape[1], activation=""relu"", kernel_initializer=initializer)) model.add(Dropout(0.4)) model.add(Dense(512, input_dim=X_train.shape[1], activation=""relu"", kernel_initializer=initializer)) model.add(Dropout(0.4)) model.add(Dense(1, activation=""linear"", kernel_initializer=initializer)) adam = Adam(lr=1e-3, decay=1e-3) # Compile model model.compile(loss=""mean_squared_error"", optimizer=adam, metrics=[rmse, rmspe]) return model'",No,3,4.0 "train, test = clean_data(use_text_columns = True)",No,3,13.0 "# Hyperparameters and load data to train the model batch_size = 512 nb_epoch = 300 scaler_x = StandardScaler() scaler_y = StandardScaler() print('Loading data...') (X_train, y_train), (X_test, y_test) = load_train_data(scaler_x, scaler_y) print('Build model...') model = create_model() model.summary()",Yes,3,4.0 "print('Fit model...') filepath=""weights_rossmann.best.hdf5"" checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min') early_stopping = EarlyStopping(monitor='val_loss', patience=10, verbose=1, mode='min') callbacks_list = [checkpoint, early_stopping] log = model.fit(X_train, y_train, validation_split=0.20, batch_size=batch_size, epochs=nb_epoch, shuffle=True, callbacks=callbacks_list)'",No,4,7.0 "show_info(model, X_test, y_test, log, 
weights='weights_rossmann.best.hdf5')",No,4,35.0 "test_data = load_test_data() df_teste = pd.read_csv('../input/test.csv')",No,5,45.0 "predict = model.predict(test_data) predict = scaler_y.inverse_transform(predict)",No,4,48.0 "submission = pd.DataFrame() submission['Id'] = df_teste[""Id""] submission['Sales'] = predict submission.to_csv('submission.csv', index=False)'",No,4,25.0 "df_correlation=df_train_store[['Store', 'DayOfWeek', 'Sales', 'Customers', 'Open', 'Promo', 'SchoolHoliday', 'CompetitionDistance', 'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2', 'Promo2SinceWeek', 'Promo2SinceYear', 'SalesperCustomer', 'Month', 'Year', 'Day', 'StateHoliday_cat', 'Assortment_cat', 'StoreType_cat', 'PromoInterval_cat']]",No,5,77.0 "df_correlation=df_correlation.drop('Open', axis = 1)",No,5,10.0 "upper_triangle = np.zeros_like(df_correlation.corr(), dtype = np.bool) upper_triangle[np.triu_indices_from(upper_triangle)] = True #make sure we don't show half of the other triangle f, ax = plt.subplots(figsize = (15, 10)) sns.heatmap(df_correlation.corr(),ax=ax,mask=upper_triangle,annot=True, fmt='.2f',linewidths=0.5,cmap=sns.diverging_palette(10, 133, as_cmap=True))",No,3,80.0 df_train_store.columns,No,5,71.0 "df_train_store['CompetitionOpenSince'] = np.where((df_train_store['CompetitionOpenSinceMonth']==0) & (df_train_store['CompetitionOpenSinceYear']==0) , 0,(df_train_store.Month - df_train_store.CompetitionOpenSinceMonth) + (12 * (df_train_store.Year - df_train_store.CompetitionOpenSinceYear)) )",No,4,8.0 "#now that CompetitionOpenSince is created #we can get rid of `CompetitionOpenSinceYear` and `CompeitionOpenSinceMonth` del df_train_store['CompetitionOpenSinceYear'] del df_train_store['CompetitionOpenSinceMonth'] ",No,5,10.0 "df_train_store[""is_holiday_state""] = df_train_store['StateHoliday'].map({""0"": 0, ""a"": 1, ""b"": 1, ""c"": 1})'",No,4,8.0 del df_train_store['StateHoliday_cat'],No,5,10.0 "df_train_store=pd.get_dummies(df_train_store, columns=[""Assortment"", ""StoreType"",""PromoInterval""], prefix=[""is_Assortment"", ""is_StoreType"",""is_PromoInteval""])",No,5,20.0 "del df_train_store['Assortment_cat'] del df_train_store['StoreType_cat'] ",No,5,10.0 del df_train_store['PromoInterval_cat'],No,5,10.0 "df_test = pd.read_csv(""../input/test.csv"",sep=',', parse_dates=['Date'] , date_parser=str_to_date, low_memory = False) print (""The Test dataset has {} Rows and {} Variables"".format(str(df_test.shape[0]),str(df_test.shape[1])))'",No,3,45.0 "df_test.fillna(1, inplace = True) #11rows with Nans decided to leave them open since its one store 622 which is #usually open #Left-join the train to the store dataset since .Why? 
#Because you want to make sure you have all events even if some of them don't have their store information ( which shouldn't happen) df_test_store = pd.merge(df_test, df_store, how = 'left', on = 'Store') print (""The Test_Store dataset has {} Rows and {} Variables"".format(str(df_test_store.shape[0]),str(df_test_store.shape[1]))) df_test_store['Month']=df_test_store.Date.dt.month df_test_store['Year']=df_test_store.Date.dt.year df_test_store['Day']=df_test_store.Date.dt.day df_test_store['StateHoliday'] = df_test_store['StateHoliday'].astype('category') df_test_store['Assortment'] = df_test_store['Assortment'].astype('category') df_test_store['StoreType'] = df_test_store['StoreType'].astype('category') df_test_store['PromoInterval']= df_test_store['PromoInterval'].astype('category') df_test_store['StateHoliday_cat'] = df_test_store['StateHoliday'].cat.codes df_test_store['Assortment_cat'] = df_test_store['Assortment'].cat.codes df_test_store['StoreType_cat'] = df_test_store['StoreType'].cat.codes df_test_store['PromoInterval_cat'] = df_test_store['PromoInterval'].cat.codes df_test_store['StateHoliday_cat'] = df_test_store['StateHoliday_cat'].astype('float') df_test_store['Assortment_cat'] = df_test_store['Assortment_cat'].astype('float') df_test_store['StoreType_cat'] = df_test_store['StoreType_cat'].astype('float') df_test_store['PromoInterval_cat'] = df_test_store['PromoInterval_cat'].astype('float') df_test_store['CompetitionOpenSince'] = np.where((df_test_store['CompetitionOpenSinceMonth']==0) & (df_test_store['CompetitionOpenSinceYear']==0) , 0,(df_test_store.Month - df_test_store.CompetitionOpenSinceMonth) + (12 * (df_test_store.Year - df_test_store.CompetitionOpenSinceYear)) ) df_test_store[""is_holiday_state""] = df_test_store['StateHoliday'].map({""0"": 0, ""a"": 1, ""b"": 1, ""c"": 1}) df_test_store=pd.get_dummies(df_test_store, columns=[""Assortment"", ""StoreType"",""PromoInterval""], prefix=[""is_Assortment"", ""is_StoreType"",""is_PromoInteval""]) '",Yes,3,16.0 "del df_test_store[""Date""] del df_test_store['CompetitionOpenSinceYear'] del df_test_store['CompetitionOpenSinceMonth'] '",No,5,10.0 del df_test_store['StateHoliday_cat'],No,5,10.0 "del df_test_store['Assortment_cat'] del df_test_store['StoreType_cat'] del df_test_store['PromoInterval_cat']",No,5,10.0 del df_test_store['StateHoliday'],No,5,10.0 del df_train_store['StateHoliday'],No,5,10.0 "def rmspe(y, yhat): rmspe = np.sqrt(np.mean( (y - yhat)**2 )) return rmspe",No,4,49.0 "features = df_train_store.drop(['Customers', 'Sales', 'SalesperCustomer'], axis = 1) #a rule of thumb is to transform my target value to log if i see the values are very dispersed which is the case #and then of course revert them with np.exp to their real values targets=np.log(df_train_store.Sales) ",Yes,3,10.0 "X_train, X_train_test, y_train, y_train_test = model_selection.train_test_split(features, targets, test_size=0.20, random_state=15) print (""Training and testing split was successful."") ",No,5,13.0 "rfr = RandomForestRegressor(n_estimators=10, criterion='mse', max_depth=5, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=4, random_state=31, verbose=0, warm_start=False) rfr.fit(X_train, y_train) ",No,3,7.0 " ''' params = {'max_depth':(4,6,8,10,12,14,16,20), 'n_estimators':(4,8,16,24,48,72,96,128), 'min_samples_split':(2,4,6,8,10)} #scoring_fnc = metrics.make_scorer(rmspe) #the 
dimensionality is high, the number of combinations we have to search is enormous, using RandomizedSearchCV # is a better option then GridSearchCV grid = model_selection.RandomizedSearchCV(estimator=rfr,param_distributions=params,cv=10) #choosing 10 K-Folds makes sure i went through all of the data and didn't miss any pattern.(takes time to run but is worth doing it) grid.fit(X_train, y_train) ''' #I AM NOT GOING TO RUN THIS CHUNK TO BE ABLE TO COMMIT AND RUN MY KERNEL ON KAGGLE",No,3,6.0 "#with the optimal parameters i got let's see how it behaves with the validation set rfr_val=RandomForestRegressor(n_estimators=128, criterion='mse', max_depth=20, min_samples_split=10, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=4, #setting n_jobs to 4 makes sure you're using the full potential of the machine you're running the training on random_state=35, verbose=0, warm_start=False) model_RF_test=rfr_val.fit(X_train,y_train)",No,3,7.0 yhat=model_RF_test.predict(X_train_test),No,5,53.0 plt.hist(yhat),No,5,33.0 "error=rmspe(y_train_test,yhat) error",No,5,49.0 "importances = rfr_val.feature_importances_ std = np.std([rfr_val.feature_importances_ for tree in rfr_val.estimators_], axis=0) indices = np.argsort(importances) palette1 = itertools.cycle(sns.color_palette()) # Store the feature ranking features_ranked=[] for f in range(X_train.shape[1]): features_ranked.append(X_train.columns[indices[f]]) # Plot the feature importances of the forest plt.figure(figsize=(10,15)) plt.title(""Feature importances"") plt.barh(range(X_train.shape[1]), importances[indices], color=[next(palette1)], align=""center"") plt.yticks(range(X_train.shape[1]), features_ranked) plt.ylabel('Features') plt.ylim([-1, X_train.shape[1]]) plt.show() '",Yes,2,33.0 "df_test_store1=df_test_store.drop(['Id'],axis=1) kaggle_yhat= model_RF_test.predict(df_test_store1) kaggle_preds= pd.DataFrame({'Id': df_test_store['Id'], 'Sales': np.exp(kaggle_yhat)}) kaggle_preds.to_csv(""Stefano_Zakher_RF_Rossman_Kaggle_submission.csv"", index = False) '",Yes,3,55.0 "import os import pandas as pd import numpy as np import scipy import warnings warnings.filterwarnings(action='ignore') # Plotting Library import seaborn as sns import matplotlib.pyplot as plt plt.style.use('Solarize_Light2') # Other Libraries from sklearn.linear_model import LinearRegression from sklearn.metrics import mean_squared_error from math import sqrt from scipy.stats import ttest_ind ,linregress , ttest_rel import statsmodels.api as sm from scipy.stats import probplot from scipy.stats import zscore from sklearn.metrics import r2_score from statsmodels.graphics.regressionplots import influence_plot from sklearn.preprocessing import PolynomialFeatures , StandardScaler from sklearn.pipeline import Pipeline from sklearn.decomposition import PCA from sklearn.model_selection import GridSearchCV from sklearn.linear_model import LogisticRegression from sklearn.linear_model import Perceptron from sklearn.linear_model import Ridge from sklearn.linear_model import Lasso ,ElasticNet from sklearn.ensemble import RandomForestRegressor from sklearn.discriminant_analysis import LinearDiscriminantAnalysis from sklearn.model_selection import train_test_split print(os.listdir(""../input""))'",No,4,88.0 "def ispromomonth(rows): # if not rows[0].isnull(): months = {} months = str(rows['PromoInterval']).split(',') if str(rows['month_str']) in months: return 1 else: 
return 0 def rmspe(y, yhat): return np.sqrt(np.mean((yhat/y-1) ** 2)) def rmspe_xg(yhat, y): y = np.expm1(y.get_label()) yhat = np.expm1(yhat) return ""rmspe"", rmspe(y,yhat) class Rossmann_: def __init__(self , train_data_path = '../input/train.csv' , test_data_path='../input/test.csv' , store_path='../input/store.csv' , nrows =100000): self.train_data_path = train_data_path self.test_data_path = test_data_path self.store_path = store_path self.read_size = nrows self.train_data_original = pd.read_csv(self.train_data_path , low_memory = False , nrows = self.read_size) self.test_data_original = pd.read_csv(self.test_data_path ,low_memory = False , nrows = self.read_size) self.store_data_original = pd.read_csv(self.store_path) self.start_preprocessing_train(self.train_data_original , self.store_data_original) self.start_preprocessing_test(self.test_data_original , self.store_data_original) def start_preprocessing_train(self , train_data , store): train_data.StateHoliday = train_data.StateHoliday.replace('0',0) train_data.StateHoliday = train_data.StateHoliday.replace('a',1) train_data.StateHoliday = train_data.StateHoliday.replace('b',2) train_data.StateHoliday = train_data.StateHoliday.replace('c',3) train_data['Date_Year'] = train_data['Date'].apply(lambda x: int(x[:4])) train_data['Date_Month'] = train_data['Date'].apply(lambda x: int(x[5:7])) train_data['Date_Day'] = train_data['Date'].apply(lambda x: int(x[8:])) train_data_m = pd.merge(train_data, store, on='Store') mappings = {'0':0, 'a':1, 'b':2, 'c':3, 'd':4} train_data_m.StoreType.replace(mappings, inplace=True) train_data_m.Assortment.replace(mappings, inplace=True) #Finding the week of the year train_data_m['Date'] = pd.to_datetime(train_data_m['Date'], errors='coerce') train_data_m['date_WeekOfYear'] = train_data_m.Date.dt.weekofyear #Combining the Week and Year for Competition and Promo train_data_m['Competition_Weeks'] = 12*(train_data_m.Date_Year - train_data_m.CompetitionOpenSinceYear ) + (train_data_m.Date_Month - train_data_m.CompetitionOpenSinceMonth) train_data_m['Promo_Weeks'] = 12*(train_data_m.Date_Year - train_data_m.Promo2SinceYear ) + (train_data_m.Date_Month - train_data_m.Promo2SinceWeek) train_data_m['Competition_Weeks'] = train_data_m['Competition_Weeks'].apply(lambda x: x if x > 0 else 0) train_data_m['Promo_Weeks'] = train_data_m['Promo_Weeks'].apply(lambda x: x if x > 0 else 0) # is promo month is the months the promo is valid so month2str = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', 7:'Jul', 8:'Aug', 9:'Sept', 10:'Oct', 11:'Nov', 12:'Dec'} train_data_m['month_str'] = train_data_m.Date_Month.map(month2str) train_data_m['IsPromoMonth'] = train_data_m[[ 'PromoInterval' , 'month_str' ]].apply(ispromomonth , axis = 1) train_data_m.fillna(0, inplace=True) #updating the rows with sales>0 and customes>0 train_data_updated = train_data_m[train_data_m['Sales']>0] train_data_updated = train_data_updated[train_data_updated['Customers']>0] features = ['Store', 'DayOfWeek', 'Sales', 'Customers', 'Open', 'Promo','StateHoliday', 'SchoolHoliday', 'Date_Year', 'Date_Month', 'Date_Day','StoreType', 'Assortment', 'CompetitionDistance','CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2','Promo2SinceWeek', 'Promo2SinceYear', 'date_WeekOfYear', 'Competition_Weeks', 'Promo_Weeks', 'IsPromoMonth'] self.train_final = train_data_updated[features] cols = self.train_final.columns self.train_final = pd.DataFrame(StandardScaler().fit_transform(self.train_final) , columns = cols) def start_preprocessing_test(self 
, test_data , store): test_data.fillna(1 , inplace=True) # These are all the Oprations appied on the Data test_data['Date_Year'] = test_data['Date'].apply(lambda x: int(x[:4])) test_data['Date_Month'] = test_data['Date'].apply(lambda x: int(x[5:7])) test_data['Date_Day'] = test_data['Date'].apply(lambda x: int(x[8:])) test_data_m = pd.merge(test_data, store, on='Store') mappings = {'0':0, 'a':1, 'b':2, 'c':3, 'd':4} test_data_m.StoreType.replace(mappings, inplace=True) test_data_m.Assortment.replace(mappings, inplace=True) test_data_m.StateHoliday.replace(mappings, inplace=True) test_data_m['Date'] = pd.to_datetime(test_data_m['Date'], errors='coerce') test_data_m['date_WeekOfYear'] = test_data_m.Date.dt.weekofyear test_data_m['Competition_Weeks'] = 12*(test_data_m.Date_Year - test_data_m.CompetitionOpenSinceYear ) + (test_data_m.Date_Month - test_data_m.CompetitionOpenSinceMonth) test_data_m['Promo_Weeks'] = 12*(test_data_m.Date_Year - test_data_m.Promo2SinceYear ) + (test_data_m.Date_Month - test_data_m.Promo2SinceWeek) test_data_m['Competition_Weeks'] = test_data_m['Competition_Weeks'].apply(lambda x: x if x > 0 else 0) test_data_m['Promo_Weeks'] = test_data_m['Promo_Weeks'].apply(lambda x: x if x > 0 else 0) month2str = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', 7:'Jul', 8:'Aug', 9:'Sept', 10:'Oct', 11:'Nov', 12:'Dec'} test_data_m['month_str'] = test_data_m.Date_Month.map(month2str) test_data_m['IsPromoMonth'] = test_data_m[[ 'PromoInterval' , 'month_str' ]].apply(ispromomonth , axis = 1) test_data_m.fillna(0, inplace=True) features = ['Store', 'DayOfWeek', 'Open', 'Promo','StateHoliday', 'SchoolHoliday', 'Date_Year', 'Date_Month', 'Date_Day','StoreType', 'Assortment', 'CompetitionDistance','CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2','Promo2SinceWeek', 'Promo2SinceYear', 'date_WeekOfYear', 'Competition_Weeks', 'Promo_Weeks', 'IsPromoMonth'] self.test_final = test_data_m[features] def prepare_sample_data(self , limit =100 , testing_limit = 30): self.data = self.train_final.sample(frac = 1 , random_state = 98).head(limit) self.test_data = self.train_final.sample(frac = 1 , random_state = 98).tail(testing_limit) def Linear_Regression(self): print('Creating Linear Regression Model Between Sales and Customers... ') lr = LinearRegression() lr.fit(self.data['Customers'].values.reshape(-1,1) , self.data['Sales'].values.reshape(-1,1)) print('Fitting Done on Model ... 
') print(lr) r2_score = lr.score(self.data['Customers'].values.reshape(-1,1), self.data['Sales'].values.reshape(-1,1)) # print('R2 Score is ',r2_score) # print('Since the Model R2 Score is ',r2_score , ', the model explains ',round(r2_score*100,2) , ' % of the variation in GI') print('Coefficients for the linear regression problem is ',lr.coef_) print('Intersect Value is ',lr.intercept_) y_pred = lr.predict(self.data['Customers'].values.reshape(-1, 1)) rms = sqrt(mean_squared_error(self.data['Sales'].values.reshape(-1,1), y_pred)) ty_pred = lr.predict(self.test_data['Customers'].values.reshape(-1, 1)) trms = sqrt(mean_squared_error(self.test_data['Sales'].values.reshape(-1,1), ty_pred)) print('Root Mean Squared Error of Training Set is ',rms) print('Root Mean Squared Error of Testing Set is ',trms) # print('R2 Score of Training Set is ',r2_score(y_pred, self.data['Sales'].values.reshape(-1,1))) # print('R2 Score of Testing Set is ',r2_score(ty_pred, self.test_data['Sales'].values.reshape(-1,1))) plt.figure(figsize=(15,10)) plt.scatter(self.data['Customers'].values.reshape(-1, 1) , self.data['Sales'].values.reshape(-1,1) , color ='r' , label = 'Actual Values') plt.scatter(self.data['Customers'].values.reshape(-1, 1) , y_pred , color ='b' , label = 'Predicted') plt.plot(self.data['Customers'].values.reshape(-1, 1) , y_pred , color ='k' , label = 'Predicted Line') plt.xlabel('Customers Index') plt.ylabel('Sales Index') plt.legend() plt.savefig('Linear Regression Training.png') plt.figure(figsize=(15,10)) plt.scatter(self.test_data['Customers'].values.reshape(-1, 1) , self.test_data['Sales'].values.reshape(-1,1) , color ='g' , label = 'Actual Values') plt.scatter(self.test_data['Customers'].values.reshape(-1, 1) , ty_pred , color ='y' , label = 'Predicted') plt.plot(self.test_data['Customers'].values.reshape(-1, 1) , ty_pred , color ='k' , label = 'Predicted Line') plt.xlabel('Customers Index') plt.ylabel('Sales Index') plt.legend() plt.savefig('Linear Regression Testing.png') def display_graphs(self,simple = False , orders = 1): for i in range(1, orders+1,1): lm = sns.lmplot(x =""Customers"", y =""Sales"", data = self.data, scatter = True, order = i, fit_reg = True, ci = 95 ) lm.fig.suptitle(""Scatter plot with Order = ""+str(i), fontsize=16) def Mulitple_Linear_Regression(self): print('Creating Multiple Linear Regression Model... ') print('Using Columns -> ',self.data.drop(columns = ['Sales','Customers']).columns) lr = LinearRegression() lr.fit(self.data.drop(columns = ['Sales','Customers']).values , self.data['Sales'].values) print(lr) print('Fitting Done on Model ... 
') print('Coefficients for the linear regression problem is ',lr.coef_) print('Intersect Value is ',lr.intercept_) y_pred = lr.predict(self.data.drop(columns = ['Sales','Customers']).values) rms = sqrt(mean_squared_error(self.data['Sales'].values, y_pred)) ty_pred = lr.predict(self.test_data.drop(columns = ['Sales','Customers']).values) trms = sqrt(mean_squared_error(self.test_data['Sales'].values, ty_pred)) print('Root Mean Squared Error of Training Set is ',rms) print('Root Mean Squared Error of Testing Set is ',trms) # print('R2 Score of Training Set is ',r2_score(y_pred, self.data['Sales'].values.reshape(-1,1))) # print('R2 Score of Testing Set is ',r2_score(ty_pred, self.test_data['Sales'].values.reshape(-1,1))) self.data['pred'] = y_pred self.test_data['pred'] = ty_pred plt.figure(figsize=(15,10)) sns.jointplot(x = 'Sales' , y = 'pred' , data = self.data, height=10, ratio=3 , color='g' ) plt.savefig('Multiple Linear Regression Training.png') plt.figure(figsize=(15,10)) sns.jointplot(x = 'Sales' , y = 'pred' , data = self.test_data, height=10, ratio=3 , color='r' ) plt.savefig('Multiple Linear Regression Testing.png') # plt.figure(figsize=(15,10)) # plt.scatter(self.test_data['Customers'].values.reshape(-1,1) , self.test_data['Sales'].values.reshape(-1,1) , color ='g',label = 'Actual Values') # plt.scatter(self.test_data['Customers'].values.reshape(-1,1) , ty_pred , color ='y', label = 'Predicted') # plt.plot(self.test_data['Customers'].values.reshape(-1,1) , ty_pred , color ='k' , label = 'Predicted Line') # plt.xlabel('Customers Index') # plt.ylabel('Sales Index') # plt.legend() # plt.savefig('Multiple Linear Regression Testing.png') def Polynomial_Regression(self , degrees = 4): print('To Reduce Complexity...\ Using Single Data Column Customers Rather than All...') Input=[('polynomial',PolynomialFeatures(degree=degrees)),('modal',LinearRegression())] lr=Pipeline(Input) lr.fit(self.data['Customers'].values.reshape(-1,1) , self.data['Sales'].values.reshape(-1,1)) print('Fitting Done on Model ... 
') r2_score = lr.score(self.data['Customers'].values.reshape(-1,1), self.data['Sales'].values.reshape(-1,1)) # print('R2 Score is ',r2_score) # print('Since the Model R2 Score is ',r2_score , ', the model explains ',round(r2_score*100,2) , ' % of the variation in GI') self.data.sort_values(by='Customers' , inplace = True) self.test_data.sort_values(by='Customers' , inplace = True) y_pred = lr.predict(self.data['Customers'].values.reshape(-1, 1)) rms = sqrt(mean_squared_error(self.data['Sales'].values.reshape(-1,1), y_pred)) ty_pred = lr.predict(self.test_data['Customers'].values.reshape(-1, 1)) trms = sqrt(mean_squared_error(self.test_data['Sales'].values.reshape(-1,1), ty_pred)) print('Root Mean Squared Error of Training Set is ',rms) print('Root Mean Squared Error of Testing Set is ',trms) # print('R2 Score of Training Set is ',r2_score(y_pred, self.data['Sales'].values.reshape(-1,1))) # print('R2 Score of Testing Set is ',r2_score(ty_pred, self.test_data['Sales'].values.reshape(-1,1))) plt.figure(figsize=(15,10)) plt.scatter(self.data['Customers'].values.reshape(-1, 1) , self.data['Sales'].values.reshape(-1,1) , color ='r',label = 'Actual Values') plt.scatter(self.data['Customers'].values.reshape(-1, 1) , y_pred , color ='b', label = 'Predicted') plt.plot(self.data['Customers'].values.reshape(-1, 1) , y_pred , color ='k' , label = 'Predicted Line') plt.xlabel('Customers Index') plt.ylabel('Sales Index') plt.legend() plt.savefig('Polynomial Regression Training {}.png'.format(degrees)) plt.figure(figsize=(15,10)) plt.scatter(self.test_data['Customers'].values.reshape(-1, 1) , self.test_data['Sales'].values.reshape(-1,1) , color ='g',label = 'Actual Values') plt.scatter(self.test_data['Customers'].values.reshape(-1, 1) , ty_pred , color ='y', label = 'Predicted') plt.plot(self.test_data['Customers'].values.reshape(-1, 1) , ty_pred , color ='k' , label = 'Predicted Line') plt.xlabel('Customers Index') plt.ylabel('Sales Index') plt.legend() plt.savefig('Polynomial Regression Testing {}.png'.format(degrees)) def return_model(self,reg = 'Ridge' , alpha = 0.01): if reg == 'Ridge': lr = Ridge(alpha=alpha) elif reg =='Lasso': lr = Lasso(alpha=alpha) elif reg =='Elastic': lr = ElasticNet(alpha = alpha) else: lr = Ridge(alpha=alpha , solver = 'cholesky', tol = .005) return lr def Other_Regression(self , reg = 'Ridge'): print('Creating Multiple {} Regression Model... '.format(reg)) print('Using Columns -> ',self.data.drop(columns = ['Sales','Customers']).columns) lr = self.return_model(reg = reg) lr.fit(self.data.drop(columns = ['Sales','Customers']).values , self.data['Sales'].values) print(lr) print('Fitting Done on Model ... 
') print('Coefficients for the linear regression problem is ',lr.coef_) print('Intersect Value is ',lr.intercept_) y_pred = lr.predict(self.data.drop(columns = ['Sales','Customers']).values) rms = sqrt(mean_squared_error(self.data['Sales'].values, y_pred)) ty_pred = lr.predict(self.test_data.drop(columns = ['Sales','Customers']).values) trms = sqrt(mean_squared_error(self.test_data['Sales'].values, ty_pred)) print('Root Mean Squared Error of Training Set is ',rms) print('Root Mean Squared Error of Testing Set is ',trms) print('Creating Alpha VS Mean Squared Error Graph for Alpha') alphas = [] train_loss = [] test_loss = [] for i in range(10000): alphas.append(i*0.0015 +0.0001) lr = self.return_model(reg = reg , alpha = (i*0.0015 +0.0001)) lr.fit(self.data.drop(columns = ['Sales','Customers']).values , self.data['Sales'].values) y_pred = lr.predict(self.data.drop(columns = ['Sales','Customers']).values) rms = sqrt(mean_squared_error(self.data['Sales'].values, y_pred)) ty_pred = lr.predict(self.test_data.drop(columns = ['Sales','Customers']).values) trms = sqrt(mean_squared_error(self.test_data['Sales'].values, ty_pred)) train_loss.append(rms) test_loss.append(trms) plt.figure(figsize=(15,10)) plt.plot(alphas , train_loss , color ='r' , label = 'Training Loss') plt.xlabel('Alpha') plt.ylabel('Loss (RMSE)') plt.legend() plt.savefig('{} Regression Alpha Training.png'.format(reg)) plt.figure(figsize=(15,10)) plt.plot(alphas , test_loss , color ='g' , label = 'Testing Loss') plt.xlabel('Alpha') plt.ylabel('Loss (RMSE)') plt.legend() plt.savefig('{} Regression Alpha Testing.png'.format(reg)) print('Using Single Column now ....') lr = self.return_model(reg = reg) lr.fit(self.data['Customers'].values.reshape(-1,1) , self.data['Sales'].values.reshape(-1,1)) print('Fitting Done on Model ... 
') print(lr) r2_score = lr.score(self.data['Customers'].values.reshape(-1,1), self.data['Sales'].values.reshape(-1,1)) print('Coefficients for the linear regression problem are ',lr.coef_) print('Intercept Value is ',lr.intercept_) y_pred = lr.predict(self.data['Customers'].values.reshape(-1, 1)) rms = sqrt(mean_squared_error(self.data['Sales'].values.reshape(-1,1), y_pred)) ty_pred = lr.predict(self.test_data['Customers'].values.reshape(-1, 1)) trms = sqrt(mean_squared_error(self.test_data['Sales'].values.reshape(-1,1), ty_pred)) print('Root Mean Squared Error of Training Set is ',rms) print('Root Mean Squared Error of Testing Set is ',trms) plt.figure(figsize=(15,10)) plt.scatter(self.data['Customers'].values.reshape(-1, 1) , self.data['Sales'].values.reshape(-1,1) , color ='r' , label = 'Actual Values') plt.scatter(self.data['Customers'].values.reshape(-1, 1) , y_pred , color ='b' , label = 'Predicted') plt.plot(self.data['Customers'].values.reshape(-1, 1) , y_pred , color ='k' , label = 'Predicted Line') plt.xlabel('Customers Index') plt.ylabel('Sales Index') plt.legend() plt.savefig('{} Regression Training.png'.format(reg)) plt.figure(figsize=(15,10)) plt.scatter(self.test_data['Customers'].values.reshape(-1, 1) , self.test_data['Sales'].values.reshape(-1,1) , color ='g' , label = 'Actual Values') plt.scatter(self.test_data['Customers'].values.reshape(-1, 1) , ty_pred , color ='y' , label = 'Predicted') plt.plot(self.test_data['Customers'].values.reshape(-1, 1) , ty_pred , color ='k' , label = 'Predicted Line') plt.xlabel('Customers Index') plt.ylabel('Sales Index') plt.legend() plt.savefig('{} Regression Testing.png'.format(reg))'",Yes,1,7.0 ross = Rossmann_(),No,5,77.0 "ross.prepare_sample_data(limit =200 , testing_limit = 40) ross.Linear_Regression()",No,4,7.0 "ross.prepare_sample_data(limit =2000 , testing_limit = 400) ross.Mulitple_Linear_Regression()",No,4,7.0 "ross.prepare_sample_data(limit =10000 , testing_limit = 4000) ross.Polynomial_Regression(degrees = 3)",No,4,7.0 "ross.prepare_sample_data(limit =10000 , testing_limit = 4000) ross.Polynomial_Regression(degrees = 2)",No,4,7.0 "ross.prepare_sample_data(limit =1000 , testing_limit = 400) ross.Other_Regression(reg = 'Ridge')",No,4,7.0 "ross.prepare_sample_data(limit =1000 , testing_limit = 400) ross.Other_Regression(reg = 'Lasso')",No,4,7.0 "ross.prepare_sample_data(limit =1000 , testing_limit = 400) ross.Other_Regression(reg = 'Elastic')",No,4,7.0 "ross.prepare_sample_data(limit =1000 , testing_limit = 400) ross.Other_Regression(reg = 'Bridge')",No,4,7.0 "# Get list of categorical variables new= data[data.columns[~data.columns.isin(['Open Date','days','year','month'])]] numerical_features = new.select_dtypes([np.number]).columns.tolist() categorical_features = new.select_dtypes(exclude = [np.number,np.datetime64]).columns.tolist() categorical_features ",No,4,8.0 "from sklearn import preprocessing from sklearn.model_selection import train_test_split",No,5,22.0 "#data = data.drop('Id', axis=1) #test_data = test_data.drop('Id', axis=1) y= data.revenue x_train = data[data.columns[~data.columns.isin(['Open Date','revenue'])]] #train features to be fit in model x_test = test_data[test_data.columns[~test_data.columns.isin(['Open Date'])]] #test features ",No,5,21.0 "from sklearn.preprocessing import LabelEncoder # Processing the categorical columns to provide vector form of feature class DataFrameProcess: def __init__(self,df,col): self.df =df self.col=col def dataEncoding(self): if self.df[self.col].dtype.name == 'object' or 
self.df[self.col].dtype.name == 'category': le = LabelEncoder() self.df[self.col] = le.fit_transform(self.df[self.col]) def data_transform(df): for col in df.columns: data_prcs = DataFrameProcess(df,col) data_prcs.dataEncoding() data_transform(x_train) data_transform(x_test)",Yes,4,8.0 x_train.head(5),No,5,41.0 "from xgboost import XGBRegressor from sklearn.ensemble import GradientBoostingRegressor gbRegr = GradientBoostingRegressor(max_depth=3, random_state=42) gbRegr.fit(x_train, y) prediction_rr = gbRegr.predict(x_test) ",Yes,4,7.0 "test_label=pd.read_csv('../input/restaurant-revenue-prediction/sampleSubmission.csv') # test target test_label.head(10)",No,4,45.0 "from sklearn.metrics import mean_squared_error from math import sqrt label_list=test_label['Prediction'].tolist()",Yes,5,16.0 "print('Root Mean squared error {}'.format(sqrt(mean_squared_error(label_list, prediction_rr)))) ",No,5,49.0 "from sklearn import ensemble params = {'n_estimators': 100, 'max_depth': 4, 'min_samples_split': 2, 'learning_rate': 0.05, 'loss': 'ls'} GBR = ensemble.GradientBoostingRegressor(**params) GBR.fit(x_train, y) preds_GBR = GBR.predict(x_test) GradientBoostingRegressor_RMSE= sqrt(mean_squared_error(label_list, preds_GBR)) print('Root Mean squared error {}'.format(GradientBoostingRegressor_RMSE))",Yes,3,48.0 "parameters = [{'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000], 'learning_rate': [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1.0] }] from sklearn.model_selection import GridSearchCV gsearch = GridSearchCV(estimator=XGBRegressor(), param_grid = parameters, scoring='neg_mean_absolute_error', n_jobs=4,cv=3) gsearch.fit(x_train,y) gsearch.best_params_, gsearch.best_score_",Yes,4,7.0 "final_model = XGBRegressor(n_estimators=gsearch.best_params_.get('n_estimators'), learning_rate=gsearch.best_params_.get('learning_rate'), n_jobs=4)",No,5,4.0 "final_model.fit(x_train, y)",No,5,7.0 preds_test = final_model.predict(x_test),No,5,48.0 "from sklearn.ensemble import RandomForestRegressor from sklearn.metrics import mean_squared_error from math import sqrt rf_model = RandomForestRegressor(random_state=1) rf_model.fit(x_train, y) rf_val_predictions = rf_model.predict(x_test) RMSE = sqrt(mean_squared_error(label_list,rf_val_predictions)) print(RMSE) ",Yes,4,7.0 "submission = pd.DataFrame({ ""Id"": test_data[""Id""], ""Prediction"": rf_val_predictions }) submission.to_csv('submission.csv',header=True, index=False) print('done')'",No,5,25.0 "b""month2str = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', \\\n 7:'Jul', 8:'Aug', 9:'Sept', 10:'Oct', 11:'Nov', 12:'Dec'}\nall_data['month_str'] = all_data.Month.map(month2str)\n\ndef check(row):\n if isinstance(row['PromoInterval'],str) and row['month_str'] in row['PromoInterval']:\n return 1\n else:\n return 0\n\nall_data['IsPromoMonth'] = all_data.apply(lambda row: check(row),axis=1) """,No,5,20.0 "tmp = all_data[all_data['part']=='train'] sns.boxplot('IsPromoMonth', 'Sales', data=tmp)",No,5,33.0 "all_data ['isBeforeCompetition'] = all_data.apply(lambda x: 1 if x['Year'] < x['CompetitionOpenSinceYear'] else 0, 1)",No,5,14.0 "tmp = all_data[all_data['part']=='train'] sns.boxplot('isBeforeCompetition', 'Sales', data=tmp)",No,5,33.0 "fig, ax = plt.subplots(5, 1, figsize=(15,10)) for p in range (5): i = np.random.choice(data['Store'].unique()) data[data['Store']== i ].plot('Date', 'Sales', ax=ax[p]) ax[p] .set_title(""Store %d"" %i) plt.tight_layout() plt.show()'",No,5,75.0 "fig, ax = 
plt.subplots (1,5, figsize=(25,4)) sns.boxplot('StoreType', 'Sales','Promo', data=data, ax=ax[0]) sns.boxplot('StoreType', 'Sales', 'SchoolHoliday', data=data, ax=ax[1]) sns.boxplot('StoreType','Sales','Assortment', data=data, ax=ax[2]) sns.boxplot('StoreType', 'Sales', 'StateHoliday', data=data, ax=ax[3]) sns.boxplot('StoreType', 'Sales', 'Promo2', data=data, ax=ax[4]) plt.tight_layout()",No,5,33.0 "grid = sns.FacetGrid(data, col=""StoreType"", row=""Promo"", palette=""tab10"", col_order=""abcd"") grid.map(sns.pointplot, ""Month"", ""Sales"") plt.show()",No,5,33.0 all_data['SalesPerCustomer'] = data['Sales']/data['Customers'],No,5,8.0 "grid = sns.FacetGrid(all_data, col=""StoreType"", row=""Promo"", palette=""tab10"", col_order=""abcd"") grid.map(sns.pointplot, ""Month"", ""SalesPerCustomer"") plt.show()",No,5,33.0 "mappings = {'0':0, 'a':1, 'b':2, 'c':3, 'd':4} all_data.StoreType.replace(mappings, inplace=True) all_data.Assortment.replace(mappings, inplace=True) all_data.StateHoliday.replace(mappings, inplace=True)",No,5,20.0 "def prepareDf (df, submission=False): tests_date = ""2015-06-12"" tmp_data = all_data[all_data['part']=='train'] .copy() if not submission: tmp_data = tmp_data[tmp_data['Date'] 0 else 0) train['PromoOpen'] = train.PromoOpen.apply(lambda x: x if x > 0 else 0) test['CompetitionOpen'] = 12 * (test.Year - test.CompetitionOpenSinceYear) + (test.Month - test.CompetitionOpenSinceMonth) test['PromoOpen'] = 12 * (test.Year - test.Promo2SinceYear) + (test.WeekOfYear - test.Promo2SinceWeek) / 4.0 test['CompetitionOpen'] = test.CompetitionOpen.apply(lambda x: x if x > 0 else 0) test['PromoOpen'] = test.PromoOpen.apply(lambda x: x if x > 0 else 0)",No,5,8.0 "month2str = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', 7:'Jul', 8:'Aug', 9:'Sept', 10:'Oct', 11:'Nov', 12:'Dec'} train['monthStr'] = train.Month.map(month2str) train.loc[train.PromoInterval == 0, 'PromoInterval'] = '' train['IsPromoMonth'] = 0 for interval in train.PromoInterval.unique(): if interval != '': for month in interval.split(','): train.loc[(train.monthStr == month) & (train.PromoInterval == interval), 'IsPromoMonth'] = 1 month2str = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', 7:'Jul', 8:'Aug', 9:'Sept', 10:'Oct', 11:'Nov', 12:'Dec'} test['monthStr'] = test.Month.map(month2str) test.loc[test.PromoInterval == 0, 'PromoInterval'] = '' test['IsPromoMonth'] = 0 for interval in test.PromoInterval.unique(): if interval != '': for month in interval.split(','): test.loc[(test.monthStr == month) & (test.PromoInterval == interval), 'IsPromoMonth'] = 1 ",No,4,20.0 "train.keys(),test.keys()",No,5,40.0 "train.drop(['Date','Customers','Open','PromoInterval','monthStr'],axis=1,inplace =True) test.drop(['Date','Open','PromoInterval','monthStr'],axis=1,inplace =True) # train = train[train.Sales != 0] ho_xtrain = train.drop(['Sales'],axis=1 ) ho_ytrain = train.Sales ho_xtest=test ho_xtest=ho_xtest.sort_values(by=['Id']) # ho_xtest = test.drop(['Sales'],axis=1 ) # ho_ytest = test.Sales",No,4,10.0 "ho_xtest ",No,5,41.0 " ho_xtest.keys() , ho_xtrain.keys()",No,5,40.0 " preprocessed_dataset=ho_xtrain.to_numpy() #preprocessed_train_labels=np.log1p(ho_ytrain.to_numpy()+1) preprocessed_train_labels=(ho_ytrain.to_numpy()+1)/1000 preprocessed_test_dataset=ho_xtest.to_numpy() # preprocessed_tr_labels=np.log1p(ho_ytest.to_numpy()) # preprocessed_train_labels=ho_ytrain.to_numpy()",No,5,16.0 " preprocessed_test_dataset=ho_xtest.to_numpy() ",No,4,21.0 "from sklearn.model_selection import train_test_split X_train, X_test, 
y_train, y_test = train_test_split( preprocessed_dataset, preprocessed_train_labels, test_size=0.2) X_train=np.expand_dims(X_train,axis=-1) X_test=np.expand_dims(X_test,axis=-1) y_train=np.array(y_train) y_test=np.array(y_test) y_train=y_train y_test=y_test print(X_train.shape) y_train.max(),y_test.max() ",No,4,21.0 "def rmspe(y_true, y_pred): ''' RMSPE calculus to use during training phase ''' return K.sqrt(K.mean(K.square(((y_true) - (y_pred) ) / (y_true)), axis=-1)) def rmse(y_true, y_pred): ''' RMSE calculus to use during training phase ''' return K.sqrt(K.mean(K.square(y_pred - y_true))) def rmspe_val(y_true, y_pred): ''' RMSPE calculus to validate evaluation metric about the model ''' return np.sqrt(np.mean(np.square(((y_true) - (y_pred) ) / (y_true)), axis=0))[0] ",No,5,84.0 "from keras.utils.np_utils import to_categorical from keras.models import Model, Sequential, model_from_json from keras.optimizers import SGD, Adam, RMSprop from keras.layers import Input, Dense, Dropout, Flatten, Lambda, Embedding,BatchNormalization,Input,Add,Concatenate from keras.initializers import RandomNormal, Constant, he_normal from keras.callbacks import ModelCheckpoint, EarlyStopping from keras import regularizers from keras.layers import Conv2D, MaxPooling2D, Conv1D, MaxPooling1D,Dense, Dropout, Flatten, Reshape, GlobalAveragePooling1D import keras from keras import backend as K import tensorflow as tf def model1(): initializer = he_normal() dilation_rate=1 bn=BatchNormalization inp=Input(shape=(X_train.shape[1],1)) x1=bn()(Conv1D(50, kernel_size=5, dilation_rate=dilation_rate, activation='relu', padding=""same"",kernel_initializer=initializer)(inp)) x2=bn()(Conv1D(50, kernel_size=5, dilation_rate=dilation_rate, activation='relu', padding=""same"",kernel_initializer=initializer)(x1)) x2= Concatenate()([x1,x2]) x3=bn()(Conv1D(50, kernel_size=5, dilation_rate=dilation_rate, activation='relu', padding=""same"",kernel_initializer=initializer)(x2)) x3=Concatenate()([x1,x2,x3]) x=bn()(Conv1D(50, kernel_size=1, dilation_rate=dilation_rate, activation='relu', padding=""same"",kernel_initializer=initializer)(x3)) x=MaxPooling1D(2)(x) x3=bn()(Conv1D(100, kernel_size=5, dilation_rate=dilation_rate, activation='relu', padding=""same"",kernel_initializer=initializer)(x)) x4=bn()(Conv1D(100, kernel_size=5, dilation_rate=dilation_rate, activation='relu', padding=""same"",kernel_initializer=initializer)(x3)) x4= Concatenate()([x3,x4]) x5=bn()(Conv1D(100, kernel_size=5, dilation_rate=dilation_rate, activation='relu', padding=""same"",kernel_initializer=initializer)(x4)) x=Concatenate()([x3,x4,x5]) x=bn()(Conv1D(100, kernel_size=1, dilation_rate=dilation_rate, activation='relu', padding=""same"",kernel_initializer=initializer)(x)) x=GlobalAveragePooling1D()(x) x=Dense(500, activation=""linear"")(x) y=Dense(1)(x) model= Model(inputs=inp, outputs= y) adam = Adam(lr=1e-3) model.compile(loss=""mae"", optimizer=adam, metrics=[rmspe,""mse"",""mae"",rmse]) # Compile model return model # model_m.compile(loss=""mae"", optimizer=adam, metrics=[rmspe,""mae"",""mse"",rmse]) model_m=model1() print('Build model...') model_m.summary()'",No,3,4.0 " batch_size=80000 nb_epoch=400 print('Fit model...') filepath=""weights_rossmann.best.hdf5"" checkpoint = ModelCheckpoint(filepath, monitor='val_rmspe', verbose=1, save_best_only=True, mode='min') callbacks_list = [checkpoint] log = model_m.fit(X_train, y_train, validation_data=(X_test,y_test), batch_size=batch_size ,epochs=nb_epoch, shuffle=True, callbacks=callbacks_list)'",No,4,7.0 
"model_m.load_weights(filepath) preprocessed_test_dataset ypred=model_m.predict(np.expand_dims(preprocessed_test_dataset[:,1:],axis=-1)) # results=np.concatenate([np.expand_dims(preprocessed_test_dataset[:,0],axis=-1),np.expm1(ypred)-1],axis=-1) results=np.concatenate([np.expand_dims(preprocessed_test_dataset[:,0],axis=-1),ypred*1000],axis=-1)",No,3,48.0 "import csv with open('submission.csv', mode='w') as csv_file: fieldnames = ['Id', 'Sales'] writer = csv.DictWriter(csv_file, fieldnames=fieldnames) writer.writeheader() for i in results: print(i) writer.writerow({'Id':i[0], 'Sales': max(0,i[1])}) #writer.writerow({'emp_name': 'John Smith', 'dept': 'Accounting', 'birth_month': 'November'}) #writer.writerow({'emp_name': 'Erica Meyers', 'dept': 'IT', 'birth_month': 'March'}) ",No,5,25.0 "for a,b in zip(y_test,X_test): if a==0: print(a,b[0],b[1],b[2],b[3],b[4],b[5],b[6],b[7]) #Store, DayOfWeek ,int(Date[0]),int(Date[1]),int(Date[2]), Open, Promo,StateHoliday, SchoolHoliday",No,5,53.0 !ls,No,3,88.0 "import pandas as pd from pandas import datetime import matplotlib.pyplot as plt import seaborn as sns import numpy as np",No,5,22.0 "df = pd.read_csv('../input/rossmann-store-sales/train.csv', parse_dates = ['Date'], low_memory = False) df.head()",No,4,45.0 "df['Date']=pd.to_datetime(df['Date'],format='%Y-%m-%d')",No,5,16.0 "df['Hour'] = df['Date'].dt.hour df['Day_of_Month'] = df['Date'].dt.day df['Day_of_Week'] = df['Date'].dt.dayofweek df['Month'] = df['Date'].dt.month",No,5,8.0 "print(df['Date'].min()) print(df['Date'].max())",No,5,40.0 "test = pd.read_csv('../input/rossmann-store-sales/test.csv', parse_dates = True, low_memory = False) test.head()",No,4,45.0 "test['Date']=pd.to_datetime(test['Date'],format='%Y-%m-%d')",No,5,16.0 "test['Hour'] = test['Date'].dt.hour test['Day_of_Month'] = test['Date'].dt.day test['Day_of_Week'] = test['Date'].dt.dayofweek test['Month'] = test['Date'].dt.month",No,5,8.0 "print(test['Date'].min()) print(test['Date'].max())",No,5,40.0 "sns.pointplot(x='Month', y='Sales', data=df)",No,5,81.0 "sns.pointplot(x='Day_of_Week', y='Sales', data=df)",No,5,75.0 "sns.countplot(x = 'Day_of_Week', hue = 'Open', data = df) plt.title('Store Daily Open Countplot')",No,5,75.0 "sns.pointplot(x='Day_of_Month', y='Sales', data=df)",No,5,75.0 "df['SalesPerCustomer'] = df['Sales']/df['Customers'] df['SalesPerCustomer'].describe()",No,4,40.0 df.Open.value_counts(),No,5,72.0 np.sum([df['Sales'] == 0]),No,5,72.0 "#drop closed stores and stores with zero sales df = df[(df[""Open""] != 0) & (df['Sales'] != 0)]'",No,5,14.0 "store = pd.read_csv('../input/rossmann-store-sales/store.csv') store.head(30)",No,4,45.0 store.isnull().sum(),No,5,39.0 "store['CompetitionDistance'] = store['CompetitionDistance'].fillna(store['CompetitionDistance'].max()) store['CompetitionOpenSinceMonth'] = store['CompetitionOpenSinceMonth'].fillna(store['CompetitionOpenSinceMonth'].mode().iloc[0]) #try 0 store['CompetitionOpenSinceYear'] = store['CompetitionOpenSinceYear'].fillna(store['CompetitionOpenSinceYear'].mode().iloc[0]) #try 0 store['Promo2SinceWeek'] = store['Promo2SinceWeek'].fillna(0) #try 0 store['Promo2SinceYear'] = store['Promo2SinceYear'].fillna(store['Promo2SinceYear'].mode().iloc[0]) #try 0 store['PromoInterval'] = store['PromoInterval'].fillna(store['PromoInterval'].mode().iloc[0]) #try 0 store.head()",No,5,17.0 "df_store = pd.merge(df, store, how = 'left', on = 'Store') df_store.head()",No,4,32.0 df_store.groupby('StoreType')['Sales'].describe(),No,5,60.0 "df_store.groupby('StoreType')['Customers', 
'Sales'].sum()",No,5,60.0 "#sales trends sns.catplot(data = df_store, x = 'Month', y = ""Sales"", col = 'StoreType', # per store type in cols palette = 'plasma', hue = 'StoreType', row = 'Promo', # per promo in the store in rows color = 'c') '",No,5,75.0 "#customer trends sns.catplot(data = df_store, x = 'Month', y = ""Customers"", col = 'StoreType', # per store type in cols palette = 'plasma', hue = 'StoreType', row = 'Promo', # per promo in the store in rows color = 'c')'",No,5,75.0 "#sales per customer sns.catplot(data = df_store, x = 'Month', y = ""SalesPerCustomer"", col = 'StoreType', # per store type in cols palette = 'plasma', hue = 'StoreType', row = 'Promo', # per promo in the store in rows color = 'c')'",No,5,75.0 "sns.catplot(data = df_store, x = 'Month', y = ""Sales"", col = 'DayOfWeek', # per store type in cols palette = 'plasma', hue = 'StoreType', row = 'StoreType', # per store type in rows color = 'c') '",No,5,75.0 "#stores open on sunday df_store[(df_store.Open == 1) & (df_store.DayOfWeek == 7)]['Store'].unique()",No,5,57.0 "sns.catplot(data = df_store, x = 'DayOfWeek', y = ""Sales"", col = 'Promo', row = 'Promo2', hue = 'Promo2', palette = 'RdPu') '",No,5,75.0 "df_store['StateHoliday'] = df_store['StateHoliday'].map({'0':0 , 0:0 , 'a':1 , 'b':2 , 'c':3}) df_store['StateHoliday'] = df_store['StateHoliday'].astype(int)",No,4,20.0 "df_store['StoreType'] = df_store['StoreType'].map({'a':1 , 'b':2 , 'c':3 , 'd':4}) df_store['StoreType'] = df_store['StoreType'].astype(int)",No,4,20.0 df_store.isnull().sum(),No,5,39.0 "df_store['Assortment'] = df_store['Assortment'].map({'a':1 , 'b':2 , 'c':3}) df_store['Assortment'] = df_store['Assortment'].astype(int)",No,4,20.0 "df_store['PromoInterval'] = df_store['PromoInterval'].map({'Jan,Apr,Jul,Oct':1 , 'Feb,May,Aug,Nov':2 , 'Mar,Jun,Sept,Dec':3}) df_store['PromoInterval'] = df_store['PromoInterval'].astype(int)",No,4,20.0 "df_store.to_csv('df_merged.csv', index=False)",No,5,25.0 len(df_store),No,5,58.0 "test = pd.merge(test, store, how = 'left', on = 'Store') test.head()",No,4,32.0 "test.fillna(method='ffill', inplace=True)",No,5,17.0 "test['StateHoliday'] = test['StateHoliday'].map({'0':0 , 0:0 , 'a':1 , 'b':2 , 'c':3}) test['StateHoliday'] = test['StateHoliday'].astype(int) test['StoreType'] = test['StoreType'].map({'a':1 , 'b':2 , 'c':3 , 'd':4}) test['StoreType'] = test['StoreType'].astype(int) test['Assortment'] = test['Assortment'].map({'a':1 , 'b':2 , 'c':3}) test['Assortment'] = test['Assortment'].astype(int) test['PromoInterval'] = test['PromoInterval'].map({'Jan,Apr,Jul,Oct':1 , 'Feb,May,Aug,Nov':2 , 'Mar,Jun,Sept,Dec':3}) test['PromoInterval'] = test['PromoInterval'].astype(int)",No,4,20.0 "test.to_csv('test_merged.csv', index=False)",No,5,25.0 "test = test.drop(['Id','Date'],axis=1)",No,5,10.0 "X = df_store.drop(['Date','Sales','Customers', 'SalesPerCustomer'],1) #Transform Target Variable y = np.log1p(df_store['Sales']) from sklearn.model_selection import train_test_split X_train , X_val , y_train , y_val = train_test_split(X, y , test_size=0.30 , random_state = 1 )",No,3,21.0 "X_train.shape, X_val.shape, y_train.shape, y_val.shape",No,5,58.0 "from sklearn.ensemble import GradientBoostingRegressor gbrt = GradientBoostingRegressor(max_depth=10, n_estimators=200, random_state=42) gbrt.fit(X_train, y_train) print(gbrt.score(X_train, y_train))",Yes,3,7.0 y_pred = gbrt.predict(X_val),No,5,48.0 "from sklearn.metrics import r2_score, mean_squared_error print(r2_score(y_val , y_pred)) print(np.sqrt(mean_squared_error(y_val , 
y_pred)))",No,5,49.0 "df1 = pd.DataFrame({'Actual': y_val, 'Predicted': y_pred}) df1.head(25)",No,4,12.0 "test_pred=gbrt.predict(test[X.columns]) test_pred_inv=np.exp(test_pred)-1",No,5,48.0 test_pred_inv,No,5,41.0 "#make submission df prediction = pd.DataFrame(test_pred_inv) submission = pd.read_csv('../input/rossmann-store-sales/sample_submission.csv') prediction_df = pd.concat([submission['Id'], prediction], axis=1) prediction_df.columns=['Id','Sales'] prediction_df.to_csv('Sample_Submission.csv', index=False)",No,3,25.0 prediction_df.head(),No,5,41.0 "# import pandas as pd import numpy as np import xgboost as xgb import missingno as msno import seaborn as sns import matplotlib as mpl import matplotlib.pyplot as plt %matplotlib inline'",No,5,23.0 "b""# \ntrain = pd.read_csv('../input/rossmann-store-sales/train.csv')\ntest = pd.read_csv('../input/rossmann-store-sales/test.csv')\nstore = pd.read_csv('../input/rossmann-store-sales/store.csv')""",No,5,45.0 "train.info(), test.info(), store.info()",No,5,40.0 "fig = plt.figure(figsize=(16,6)) ax1 = fig.add_subplot(121) ax1.set_xlabel('Sales') ax1.set_ylabel('Count') ax1.set_title('Sales of Closed Stores') plt.xlim(-1,1) train.loc[train.Open==0].Sales.hist(align='left') ax2 = fig.add_subplot(122) ax2.set_xlabel('Sales') ax2.set_ylabel('PDF') ax2.set_title('Sales of Open Stores') sns.distplot(train.loc[train.Open!=0].Sales) print('The skewness of Sales is {}'.format(train.loc[train.Open!=0].Sales.skew()))",No,5,33.0 "train = train.loc[train.Open != 0] train = train.loc[train.Sales > 0].reset_index(drop=True)",No,5,14.0 "# train train[train.isnull().values==True]'",No,5,14.0 "# test test[test.isnull().values==True]'",No,5,14.0 "# store msno.matrix(store)'",No,5,34.0 "# test test.fillna(1,inplace=True) # CompetitionDistance store.CompetitionDistance = store.CompetitionDistance.fillna(store.CompetitionDistance.median()) # 0 store.fillna(0,inplace=True)'",No,5,17.0 "b""# \ntrain = pd.merge(train, store, on='Store')\ntest = pd.merge(test, store, on='Store')""",No,5,32.0 "import statsmodels.api as sm from statsmodels.sandbox.regression.predstd import wls_prediction_std import math import sklearn.preprocessing as skpe import sklearn.model_selection as ms import sklearn.metrics as sklm import sklearn.ensemble as sken import sklearn.linear_model as lm import seaborn as sns import matplotlib.pyplot as plt",No,5,22.0 "# Reading files path=""../input/rossmann-store-sales/train.csv"" train=pd.read_csv(path) print(train.shape) train.head()",Yes,4,45.0 "path1=""../input/rossmann-store-sales/test.csv"" test=pd.read_csv(path1) print(test.shape) test.head()",Yes,4,45.0 "path2=""../input/rossmann-store-sales/store.csv"" store_df=pd.read_csv(path2) print(store_df.shape) store_df.head()",Yes,4,45.0 "train.info() print(""----------------------------------------------"") store_df.info() print(""----------------------------------------------"") test.info()",No,5,40.0 "# Adding new variable train['Sales_per_customer']=train['Sales']/train['Customers'] train['Sales_per_customer'].describe() # An average of 9.49$ is earned from a customer at a particular store",No,4,8.0 "fig, ax1 = plt.subplots(figsize=(15,4)) sns.countplot(x='Open',hue='DayOfWeek', data=train,palette=""husl"", ax=ax1) # This indicates that there are some stores which opens mostly on Sundays while some are closed on Sundays '",No,5,33.0 "# Date # Create Year and Month columns train['Year'] = train['Date'].apply(lambda x: int(str(x)[:4])) train['Month'] = train['Date'].apply(lambda x: int(str(x)[5:7])) 
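# Quick sanity check on the slicing above (a hedged sketch, assuming 'Date' is a
# 'YYYY-MM-DD' string as in the raw csv): str(x)[:4] is the year and str(x)[5:7] is
# the zero-padded month. The _sample_date literal below is illustrative only.
_sample_date = '2015-07-31'
assert int(_sample_date[:4]) == 2015
assert int(_sample_date[5:7]) == 7   # '07' parses to 7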
test['Year'] = test['Date'].apply(lambda x: int(str(x)[:4])) test['Month'] = test['Date'].apply(lambda x: int(str(x)[5:7])) # Assign Date column to Date(Year-Month) instead of (Year-Month-Day) train['Date'] = train['Date'].apply(lambda x: (str(x)[:7])) test['Date'] = test['Date'].apply(lambda x: (str(x)[:7])) # group by date and get average sales, and percent change avg_sales = train.groupby('Date')[""Sales""].mean() pct_change_sales = train.groupby('Date')[""Sales""].sum().pct_change() fig, (axis1,axis2) = plt.subplots(2,1,sharex=True,figsize=(15,8)) # plot average sales over time(year-month) ax1 = avg_sales.plot(legend=True,ax=axis1,marker='o',title=""Average Sales"") ax1.set_xticks(range(len(avg_sales))) ax1.set_xticklabels(avg_sales.index.tolist(), rotation=90) # plot precent change for sales over time(year-month) ax2 = pct_change_sales.plot(legend=True,ax=axis2,marker='o',rot=90,colormap=""summer"",title=""Sales Percent Change"")'",Yes,4,8.0 "# Plot average sales and customers over years fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,4)) sns.barplot(x='Year', y='Sales', data=train, ax=axis1) sns.barplot(x='Year', y='Customers', data=train, ax=axis2)",No,5,75.0 "# Plot average sales and customers over days of week fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,4)) sns.barplot(x='DayOfWeek', y='Sales', data=train, ax=axis1) sns.barplot(x='DayOfWeek', y='Customers', data=train, ax=axis2) ",No,5,75.0 "# Plot average sales and customers over months fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,4)) sns.barplot(x='Month', y='Sales', data=train, ax=axis1) sns.barplot(x='Month', y='Customers', data=train, ax=axis2)",No,5,75.0 "# Plot average sales and customers with/without promo fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,4)) sns.barplot(x='Promo', y='Sales', data=train, ax=axis1) sns.barplot(x='Promo', y='Customers', data=train, ax=axis2)",No,5,33.0 "b""def build_features(features, data):\n\n # \n features.extend(['Store','CompetitionDistance','CompetitionOpenSinceMonth','StateHoliday','StoreType','Assortment',\n 'SchoolHoliday','CompetitionOpenSinceYear', 'Promo', 'Promo2', 'Promo2SinceWeek', 'Promo2SinceYear'])\n \n # https://blog.csdn.net/aicanghai_smile/article/details/80987666\n \n # dt\n features.extend(['Year','Month','Day','DayOfWeek','WeekOfYear'])\n data['Year'] = data.Date.dt.year\n data['Month'] = data.Date.dt.month\n data['Day'] = data.Date.dt.day\n data['DayOfWeek'] = data.Date.dt.dayofweek\n data['WeekOfYear'] = data.Date.dt.weekofyear\n \n # 'CompetitionOpen'\n # 'PromoOpen'\n # \n features.extend(['CompetitionOpen','PromoOpen'])\n data['CompetitionOpen'] = 12*(data.Year-data.CompetitionOpenSinceYear) + (data.Month-data.CompetitionOpenSinceMonth)\n data['PromoOpen'] = 12*(data.Year-data.Promo2SinceYear) + (data.WeekOfYear-data.Promo2SinceWeek)/4.0\n data['CompetitionOpen'] = data.CompetitionOpen.apply(lambda x: x if x > 0 else 0) \n data['PromoOpen'] = data.PromoOpen.apply(lambda x: x if x > 0 else 0)\n \n # 'IsPromoMonth'10\n features.append('IsPromoMonth')\n month2str = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', 7:'Jul', 8:'Aug', 9:'Sept', 10:'Oct', 11:'Nov', 12:'Dec'}\n data['monthStr'] = data.Month.map(month2str)\n data.loc[data.PromoInterval==0, 'PromoInterval'] = ''\n data['IsPromoMonth'] = 0\n for interval in data.PromoInterval.unique():\n if interval != '':\n for month in interval.split(','):\n data.loc[(data.monthStr == month) & (data.PromoInterval == interval), 'IsPromoMonth'] = 1\n \n # \n mappings = {'0':0, 'a':1, 'b':2, 'c':3, 'd':4}\n 
data.StoreType.replace(mappings, inplace=True)\n data.Assortment.replace(mappings, inplace=True)\n data.StateHoliday.replace(mappings, inplace=True)\n data['StoreType'] = data['StoreType'].astype(int)\n data['Assortment'] = data['Assortment'].astype(int)\n data['StateHoliday'] = data['StateHoliday'].astype(int)""",No,3,20.0 "b""# Date\ntrain.Date = pd.to_datetime(train.Date, errors='coerce')\ntest.Date = pd.to_datetime(test.Date, errors='coerce')\n\n# features\nfeatures = []\n\n# traintest\nbuild_features(features, train)\nbuild_features([], test)\n\n# \nprint(features)""",No,3,16.0 "# Rmspe # https://www.kaggle.com/justdoit/xgboost-in-python-with-rmspe def ToWeight(y): w = np.zeros(y.shape, dtype=float) ind = y != 0 w[ind] = 1./(y[ind]**2) return w def rmspe(yhat, y): w = ToWeight(y) rmspe = np.sqrt(np.mean(w * (y-yhat)**2)) return rmspe def rmspe_xg(yhat, y): y = y.get_label() y = np.expm1(y) yhat = np.expm1(yhat) w = ToWeight(y) rmspe = np.sqrt(np.mean(w * (y-yhat)**2)) return ""rmspe"", rmspe def neg_rmspe(yhat, y): y = np.expm1(y) yhat = np.expm1(yhat) w = ToWeight(y) rmspe = np.sqrt(np.mean(w * (y-yhat)**2)) return -rmspe'",No,5,84.0 "from sklearn.model_selection import GridSearchCV, ShuffleSplit from sklearn.metrics import make_scorer from sklearn.tree import DecisionTreeRegressor regressor = DecisionTreeRegressor(random_state=2) cv_sets = ShuffleSplit(n_splits=5, test_size=0.2) params = {'max_depth':range(10,40,2)} scoring_fnc = make_scorer(neg_rmspe) grid = GridSearchCV(regressor,params,scoring_fnc,cv=cv_sets) grid = grid.fit(train[features], np.log1p(train.Sales)) DTR = grid.best_estimator_",No,4,6.0 "# DTR.get_params()'",No,5,79.0 "# submission = pd.DataFrame({""Id"": test[""Id""], ""Sales"": np.expm1(DTR.predict(test[features]))}) submission.to_csv(""benchmark.csv"", index=False)'",No,5,25.0 "b""# \nparams = {'objective': 'reg:linear',\n 'eta': 0.01,\n 'max_depth': 11,\n 'subsample': 0.5,\n 'colsample_bytree': 0.5,\n 'silent': 1,\n 'seed': 1\n }\nnum_trees = 10000""",No,5,59.0 "b""# \nfrom sklearn.model_selection import train_test_split\n\nX_train, X_test = train_test_split(train, test_size=0.2, random_state=2)\n\ndtrain = xgb.DMatrix(X_train[features], np.log1p(X_train.Sales))\ndvalid = xgb.DMatrix(X_test[features], np.log1p(X_test.Sales))\ndtest = xgb.DMatrix(test[features])\n\nwatchlist = [(dtrain, 'train'),(dvalid, 'eval')]\ngbm = xgb.train(params, dtrain, num_trees, evals=watchlist, early_stopping_rounds=50, feval=rmspe_xg, verbose_eval=False)""",No,2,7.0 "# test_probs = gbm.predict(xgb.DMatrix(test[features]), ntree_limit=gbm.best_ntree_limit) indices = test_probs < 0 test_probs[indices] = 0 submission = pd.DataFrame({""Id"": test[""Id""], ""Sales"": np.expm1(test_probs)}) submission.to_csv(""xgboost.csv"", index=False)'",Yes,2,48.0 "from fastai.tabular import * from isoweek import Week #import tarfile for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename)) # path to external datasets tar = tarfile.open('/kaggle/input/external-datasets/rossmann.tgz', ""r:gz"")'",No,4,88.0 "# place holders path = ""/kaggle/input/rossmann-store-sales/"" base_path=""../output""",No,3,44.0 "# paths to kaggle data sets train = ""/kaggle/input/rossmann-store-sales/train.csv"" test = ""/kaggle/input/rossmann-store-sales/test.csv"" store = ""/kaggle/input/rossmann-store-sales/store.csv"" # paths to external tar file datasets store_states = tar.extractfile('store_states.csv') state_names = tar.extractfile('state_names.csv') 
googletrend = tar.extractfile('googletrend.csv') weather = tar.extractfile('weather.csv')'",No,4,45.0 "# read in kaggle and external datasets as dataframes table_names = [train, store, store_states, state_names, googletrend, weather, test] tables = [pd.read_csv(fpath, low_memory=False) for fpath in table_names] train, store, store_states, state_names, googletrend, weather, test = tables len(train),len(test)",No,4,45.0 "print(train.shape) train.head()",No,4,58.0 "print(test.shape) test.head()",No,4,58.0 "print(store.shape) store.head()",No,4,58.0 "print(store_states.shape) store_states.head()",No,4,58.0 "print(googletrend.shape) googletrend.head()",No,4,58.0 "print(weather.shape) weather.head()",No,4,58.0 "print(train.StateHoliday.unique()) print(test.StateHoliday.unique())",No,5,57.0 "train.StateHoliday = train.StateHoliday!='0' test.StateHoliday = test.StateHoliday!='0'",No,5,14.0 "def join_df(left, right, left_on, right_on=None, suffix='_y'): if right_on is None: right_on = left_on return left.merge(right, how='left', left_on=left_on, right_on=right_on, suffixes=("""",suffix))'",No,5,32.0 "weather = join_df(weather, state_names, ""file"", ""StateName"") weather.head(3)",No,4,32.0 "googletrend['Date'] = googletrend.week.str.split(' - ', expand=True)[0] googletrend['State'] = googletrend.file.str.split('_', expand=True)[2] googletrend.loc[googletrend.State=='NI', ""State""] = 'HB,NI''",No,4,78.0 googletrend.head(3),No,5,41.0 "def add_datepart(df, fldname, drop=True, time=False): ""Helper function that adds columns relevant to a date."" fld = df[fldname] fld_dtype = fld.dtype if isinstance(fld_dtype, pd.core.dtypes.dtypes.DatetimeTZDtype): fld_dtype = np.datetime64 if not np.issubdtype(fld_dtype, np.datetime64): df[fldname] = fld = pd.to_datetime(fld, infer_datetime_format=True) targ_pre = re.sub('[Dd]ate$', '', fldname) attr = ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear', 'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start'] if time: attr = attr + ['Hour', 'Minute', 'Second'] for n in attr: df[targ_pre + n] = getattr(fld.dt, n.lower()) df[targ_pre + 'Elapsed'] = fld.astype(np.int64) // 10 ** 9 if drop: df.drop(fldname, axis=1, inplace=True)'",No,2,8.0 "add_datepart(googletrend,""Date"", drop=False) googletrend.head(3)",No,2,16.0 "# continue with all other tables add_datepart(weather, ""Date"", drop=False) add_datepart(train, ""Date"", drop=False) add_datepart(test, ""Date"", drop=False)",No,3,16.0 "trend_de = googletrend[googletrend.file == 'Rossmann_DE'] trend_de.head(3)",No,4,14.0 "store = join_df(store, store_states, ""Store"") len(store[store.State.isnull()])",No,4,32.0 "joined = join_df(train, store, ""Store"") joined_test = join_df(test, store, ""Store"") len(joined[joined.StoreType.isnull()]), len(joined_test[joined_test.StoreType.isnull()])",No,4,32.0 "# join the joined df with googletrend with [""State"",""Year"",""Week""] as the index # this way the non matching day dates do not create issues. 
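# Roughly, given the join_df helper defined earlier in this notebook, the call below
# expands to a plain left merge on the composite key, something like:
#   joined.merge(googletrend, how='left',
#                left_on=['State', 'Year', 'Week'],
#                right_on=['State', 'Year', 'Week'],
#                suffixes=('', '_y'))
# The Google trend data is weekly, so keying on (State, Year, Week) rather than the
# exact Date lets every daily row pick up the trend value for its week.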
joined = join_df(joined, googletrend, [""State"",""Year"", ""Week""]) joined_test = join_df(joined_test, googletrend, [""State"",""Year"", ""Week""]) len(joined[joined.trend.isnull()]),len(joined_test[joined_test.trend.isnull()])",No,4,32.0 "# now join the overal germany trend joined = joined.merge(trend_de, 'left', [""Year"", ""Week""], suffixes=('', '_DE')) joined_test = joined_test.merge(trend_de, 'left', [""Year"", ""Week""], suffixes=('', '_DE')) len(joined[joined.trend_DE.isnull()]),len(joined_test[joined_test.trend_DE.isnull()])'",No,4,32.0 "# finally join the weather data joined = join_df(joined, weather, [""State"",""Date""]) joined_test = join_df(joined_test, weather, [""State"",""Date""]) len(joined[joined.Mean_TemperatureC.isnull()]),len(joined_test[joined_test.Mean_TemperatureC.isnull()])",No,4,32.0 "# now we can drop duplicated columns for df in (joined, joined_test): for c in df.columns: if c.endswith('_y'): if c in df.columns: df.drop(c, inplace=True, axis=1)",No,5,10.0 "for df in (joined,joined_test): df['CompetitionOpenSinceYear'] = df.CompetitionOpenSinceYear.fillna(1900).astype(np.int32) df['CompetitionOpenSinceMonth'] = df.CompetitionOpenSinceMonth.fillna(1).astype(np.int32) df['Promo2SinceYear'] = df.Promo2SinceYear.fillna(1900).astype(np.int32) df['Promo2SinceWeek'] = df.Promo2SinceWeek.fillna(1).astype(np.int32)",No,5,16.0 "for df in (joined, joined_test): df['CompetitionOpenSince'] = pd.to_datetime(dict(year=df.CompetitionOpenSinceYear, month=df.CompetitionOpenSinceMonth, day=15)) df['CompetitionDaysOpen'] = df.Date.subtract(df.CompetitionOpenSince).dt.days",No,4,16.0 "for df in (joined, joined_test): df.loc[df.CompetitionDaysOpen<0, ""CompetitionDaysOpen""] = 0 df.loc[df.CompetitionOpenSinceYear<1990, ""CompetitionDaysOpen""] = 0",No,4,20.0 "for df in (joined,joined_test): df[""CompetitionMonthsOpen""] = df[""CompetitionDaysOpen""]//30 df.loc[df.CompetitionMonthsOpen>24, ""CompetitionMonthsOpen""] = 24 joined.CompetitionMonthsOpen.unique()",No,4,8.0 "for df in (joined, joined_test): df[""Promo2Since""] = pd.to_datetime(df.apply( lambda x: Week(x.Promo2SinceYear, x.Promo2SinceWeek).monday(), axis=1)) df[""Promo2Days""] = df.Date.subtract(df['Promo2Since']).dt.days'",No,4,16.0 "for df in (joined,joined_test): df.loc[df.Promo2Days<0, ""Promo2Days""] = 0 df.loc[df.Promo2SinceYear<1990, ""Promo2Days""] = 0 df[""Promo2Weeks""] = df[""Promo2Days""]//7 df.loc[df.Promo2Weeks<0, ""Promo2Weeks""] = 0 df.loc[df.Promo2Weeks>25, ""Promo2Weeks""] = 25 df.Promo2Weeks.unique()",No,4,8.0 "import numpy as np import pandas as pd import matplotlib.pyplot as plt import os for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename))",No,5,88.0 "b""# This decoratore shows some info about function perormance\n# etc time,shpe changes,nan values\n\ndef info(function):\n import datetime\n def wrapper(data,*args,**kargs):\n tic = datetime.datetime.now()\n result = function(data,*args,**kargs)\n toc = datetime.datetime.now()\n print(function.__name__,' took ', toc-tic)\n print('Shape: ',data.shape,' ----> ', result.shape)\n print('NaN value: ', result.isna().sum()[result.isna().sum() != 0])\n print('\\n')\n return result\n return wrapper""",No,5,53.0 "# let`s load datasets as usually train = pd.read_csv('/kaggle/input/rossmann-store-sales/train.csv') test = pd.read_csv('/kaggle/input/rossmann-store-sales/test.csv') stores = pd.read_csv('/kaggle/input/rossmann-store-sales/store.csv') sample = 
pd.read_csv('/kaggle/input/rossmann-store-sales/sample_submission.csv')",No,5,45.0 "# Thank to notebooks we can definve evaluation metric def ToWeight(y): w = np.zeros(y.shape, dtype=float) ind = y != 0 w[ind] = 1./(y[ind]**2) return w def rmspe(yhat, y): w = ToWeight(y) rmspe = np.sqrt(np.mean( w * (y - yhat)**2 )) return rmspe",No,5,84.0 "import seaborn as sns fig,ax =plt.subplots(1,2,figsize = (20,10)) ins1 = ax[0].inset_axes([0.5,0.5,0.4,0.4]) ins2 = ax[1].inset_axes([0.7,0.7,0.2,0.2]) sns.distplot(train[train.Sales != 0].Sales,ax=ax[0],bins=100) sns.distplot(np.log1p(train[train.Sales != 0].Sales),ax=ins1,bins=100,color = 'red') sns.boxplot(train[train.Sales != 0].Sales,ax=ax[1]) sns.boxplot(np.log1p(train[train.Sales != 0].Sales),ax=ins2) # We see that sales values shpw positive skeew, it can be fixed by applying np.log1p (embedded plot) # Also there are some outliers, lets define functions to perform transformation and outliers removal",No,5,33.0 "@info def log_transf(df): # log transformation function to remove skeew df.Sales = np.log1p(df.Sales) df.Customers = np.log1p(df.Customers) return df @info def remove_outliers(df,column='Sales'): # interquntile approach to remove outliers q1 = df[column].quantile(0.2) q3 = df[column].quantile(0.8) iqr = q3-q1 iqr_lower = q1 - 1.5*iqr iqr_upper = q3 + 1.5*iqr df = df.loc[(df[column] > iqr_lower) & (df[column]< iqr_upper),:] return df",No,3,8.0 "@info def timeseries_features(df): # move to datetime format df.Date = pd.to_datetime(df.Date) df = df.sort_values('Date').reset_index(drop = True) # derive regular for ml task time series features df['month'] = df.Date.dt.month df['dayofmonth'] = df.Date.dt.day df['dayofyear'] = df.Date.dt.dayofyear df['year'] = df.Date.dt.year df['is_weekday'] = df.DayOfWeek.apply(lambda x: 0 if x in (6,7) else 1) df['is_month_start'] = df.Date.dt.is_month_start.astype(int) df['is_month_end'] = df.Date.dt.is_month_end.astype(int) # also lets take into account holidays from pandas.tseries.holiday import USFederalHolidayCalendar as calendar holidays = calendar().holidays(start = df.Date.min(), end = df.Date.max()) df['is_holiday'] = df.Date.isin(holidays).astype(int) return df @info def clean_main(df): # drop days with 0 sales df = df.loc[df.Sales != 0,:].reset_index(drop = True) df = df.drop(['Open'],axis = 1) # beacus unique values contain mixed dtype array(['a', '0', 'b', 'c', 0], dtype=object) # also could be fixed during pandas importing df.StateHoliday = df.StateHoliday.astype(str) return df @info def clean_store(df): # lets drop columns with high content of nan values df.CompetitionDistance.fillna(df.CompetitionDistance.mean(),inplace = True) df.drop(['CompetitionOpenSinceMonth','CompetitionOpenSinceYear','Promo2SinceWeek','Promo2SinceYear'],axis = 1,inplace = True) import calendar # We have a list of promo monthes and we can derive usefull feature # presence or absence of promo # first create encoded dictionary Month:Number eg Feb:2 month_dict = {v: k for k,v in enumerate(calendar.month_abbr)} del month_dict[''] del month_dict['Sep'] month_dict['NaN'] = 0 # assign absence of promo 0 month_dict['Sept'] = 9 # There is no Sep # Secondly, we treat PromoInterval columns, making each row list instead of string now we have smth like ['Feb','Mar','Sept'] # and lets apply dictionary df.PromoInterval = df.PromoInterval.fillna('NaN') df.PromoInterval = df.PromoInterval.str.split(',') # Lastly we are applyin transformation df.PromoInterval = df.PromoInterval.apply(lambda x: [month_dict[value] for value in x if 
month_dict.get(value)]) # Lets create new feature that us equal to number of promo monthes df['promo_len'] = df.PromoInterval.apply(lambda x: len(x)) return df # Pipeline for train file train_prep = (train .copy() .pipe(log_transf) .pipe(remove_outliers) .pipe(timeseries_features) .pipe(clean_main) ) # Pipeline for store file store_prep = (stores .copy() .pipe(clean_store) ) # Now we merge two data_prep = pd.merge(train_prep,store_prep,how='left',on='Store') # Using our transformation in PromoInterval interval, we create binary new feature is_promo or not data_prep['is_promo'] = data_prep.apply(lambda x: 1 if x['month'] in x['PromoInterval'] else 0,axis = 1) data_prep = data_prep.drop('PromoInterval',axis=1).reset_index(drop=True)",No,2,8.0 "# Here I would like to know what is rmspe score with th emost dumb approach # we devide data on train and test test_bs = data_prep[data_prep.year == 2015] train_bs = data_prep[data_prep.year < 2015] # I am going to use mean Sales grouped by store-month-day among previous years as predicted values for 2015 predict_bs = (train_bs .groupby(['Store','month','dayofmonth']).Sales.mean().reset_index().rename({'Sales':'predictions'},axis = 1) .merge(test_bs,how='right',on = ['Store','month','dayofmonth']) .fillna(train_bs.Sales.mean()) .sort_values('Date') ) # Display baseline print('Baseline to overcome = {:.2f}'.format(rmspe(np.expm1(predict_bs.Sales),np.expm1(predict_bs.predictions)))) # Let`s see how prediction looks like fig,ax = plt.subplots(1,3,figsize = (30,10)) rnd_store = np.random.randint(min(predict_bs.Store),max(predict_bs.Store),3) for idx,store in enumerate(rnd_store): ax[idx].plot(predict_bs[predict_bs.Store == store].Date,np.expm1(predict_bs[predict_bs.Store == store].Sales), color = 'blue' ,label = 'Observed') ax[idx].plot(predict_bs[predict_bs.Store == store].Date,np.expm1(predict_bs[predict_bs.Store == store].predictions),color = 'red',label = 'Predicted') ax[idx].legend() ax[idx].set_title('Store '+str(store)) # It doesn`t look so bad",Yes,3,33.0 "# There are two few reasons to use mean (aka target) encoding # We have 1115 stores, definetly there is correlation between store and sales # We could perform leave stores as it is ----> not good for known reasons # We could perform OneHotEncoding ----> not goodm becaouse we will have 1115 new columns, mainly sparse # We can do mean encoding, eg encode stores as mean/std/other of target # I am going to use Customers to encode store, because we don`t have customers in test set # Obviusly customers can be good feature def mean_encoding(df,column,target,func = np.mean): # perform target encoding on column with some function enc_col_name = target+'_enc_'+func.__name__ df_temp = (df .groupby(column)[target] .apply(func) .reset_index() .rename({target:enc_col_name},axis=1) ) df = df.merge(df_temp,how='left',on = column) return df,df_temp data_prep,dict_for_test = mean_encoding(data_prep,'Store','Customers',func = np.mean) ",No,1,16.0 "b""# also it is good to statistic\n\nfrom statsmodels.tsa.seasonal import seasonal_decompose\nfrom statsmodels.graphics.tsaplots import plot_acf,plot_pacf\nfrom statsmodels.tsa.stattools import adfuller\n\n\n# first lets check our data for stationarity\n\ncounter = 0\nfor store in data_prep.Store.unique():\n df_store = data_prep.copy().loc[data_prep.Store == store,['Date','Sales']].set_index('Date')\n # since we removed some dates, lets resample data on a daily basis and fillna with 0\n df_store = df_store.resample('D').fillna('bfill')\n adf = 
adfuller(df_store,regression='c', autolag='AIC')\n \n if adf[1] > 0.05:\n print('Adfuller for store {} : p-value = {:.5f} > 5% -----> NON STATIONARY'.format(store,adf[1]*100))\n counter+=1\n # also we can use it as a feature\n # Doesnt make sense becaause only ~3 of store are not statonary\n \nprint('\\n {:.2f} % of stores are non stationary '.format(counter/len(data_prep.Store.unique())*100))\n\n# There is a chance to use traditional time series technique(ARIMA,SARIMAX, smothing) but i would ike to continue with ml""",Yes,2,22.0 "# lets check few random stores rnd_store = np.random.randint(min(data_prep.Store),max(data_prep.Store),3) fig,ax = plt.subplots(3,2,figsize = (15,10)) for idx,store in enumerate(rnd_store): df_store = data_prep.copy().loc[data_prep.Store == store,['Date','Sales']].set_index('Date') df_store = df_store.resample('D').fillna('bfill') plot_acf(df_store,lags = 60,ax = ax[idx,0],label = store) plot_pacf(df_store,lags = 60,ax = ax[idx,1], label = store) ax[idx,0].set_title('Autocorelation for store {}'.format(store)) ax[idx,1].set_title('Partial Autocorelation for store {}'.format(store)) plt.tight_layout() # By running this part few times we can notice that almost for all stores there is hogh corelation with following lags: # 1 14,28,42, 49 # Therefore lets use this values to create new features # But we need to preduct 48 days in future, threre fore we cannot use something lower 48",No,3,33.0 "b""# finally lets check on nan and dublicated values\n \nprint('NaN summary\\n\\n',data_prep.isna().sum()/len(data_prep)*100,'\\n')\nprint('Number of absoulute dublicates:',data_prep.duplicated().sum())\nprint('Number of Store - Date dublicates:',data_prep.duplicated(subset = ['Date','Store']).sum())""",No,3,39.0 "from pandas.plotting import scatter_matrix import seaborn corr = data_prep.corr() plt.figure(figsize=(15,15)) seaborn.heatmap(corr)",No,5,80.0 "stores = np.random.randint(train.Store.min(),train.Store.max(),2) plt.figure(figsize=(15,10)) for store in stores: plt.plot(data_prep.loc[(data_prep.Store == store) & (data_prep.year == 2013),'Date'],data_prep.loc[(data_prep.Store == store) & (data_prep.year == 2013),'Sales'],label = store) plt.legend()",No,5,75.0 "ohe_col = data_prep.select_dtypes('object').columns.tolist()+['Store','DayOfWeek','month'] num_col = data_prep.select_dtypes('float').columns.tolist()",No,2,8.0 "X = data_prep.drop(['Date','Sales','Customers','Store'],axis = 1) y = data_prep.Sales X_train,X_val = X.loc[X.year < 2015,:],X.loc[X.year == 2015,:] y_train,y_val = y[:X_train.index[-1]+1], y[X_train.index[-1]+1:]",No,3,13.0 "from sklearn.preprocessing import StandardScaler,MinMaxScaler from sklearn.preprocessing import OneHotEncoder,LabelEncoder,OrdinalEncoder from sklearn.compose import make_column_transformer from sklearn.pipeline import make_pipeline transformer = make_column_transformer( (StandardScaler(),['CompetitionDistance', 'Customers_enc_mean']), (OneHotEncoder(),['StateHoliday', 'StoreType', 'Assortment', 'DayOfWeek', 'month']), remainder = 'passthrough' ) import xgboost as xgb regressor = xgb.XGBRegressor(n_estimators = 200, max_depth = 10 ) pipeline = make_pipeline(transformer, regressor) pipeline.fit(X_train,y_train) print('TRAIN RMSPE = ',rmspe(np.expm1(pipeline.predict(X_train)),np.expm1(y_train))) print('VAL RMSPE = ',rmspe(np.expm1(pipeline.predict(X_val)),np.expm1(y_val)))",No,3,7.0 "# We need to apply same transformation on test set as we did we train set test_prep = (test .copy() .pipe(timeseries_features) .drop(['Open','Date'],axis=1) 
) test_prep = pd.merge(test_prep,store_prep,how='left',on='Store') test_prep['is_promo'] = test_prep.apply(lambda x: 1 if x['month'] in x['PromoInterval'] else 0,axis = 1) test_prep = pd.merge(test_prep,dict_for_test,how='left',on='Store') test_prep = test_prep.drop(['PromoInterval','Store'],axis=1).reset_index(drop=True) ",No,4,32.0 "test_id = test_prep.Id test_prep.drop('Id',axis=1,inplace = True) predict = np.expm1(pipeline.predict(test_prep)) # Remember to make inverse transformation",No,5,48.0 "sub = pd.DataFrame({'Id':test_id,'Sales':predict}).sort_values('Id').reset_index(drop=True) sub.to_csv('submission.csv',index=False)",No,5,25.0 "columns = [""Date"", ""Store"", ""Promo"", ""StateHoliday"", ""SchoolHoliday""]",No,4,21.0 "df = train[columns].append(test[columns]) df.head(3)",No,4,21.0 "fld = 'SchoolHoliday' df = df.sort_values(['Store', 'Date']) get_elapsed(fld, 'After') df = df.sort_values(['Store', 'Date'], ascending=[True, False]) get_elapsed(fld, 'Before')",No,4,9.0 "fld = 'StateHoliday' df = df.sort_values(['Store', 'Date']) get_elapsed(fld, 'After') df = df.sort_values(['Store', 'Date'], ascending=[True, False]) get_elapsed(fld, 'Before')",No,4,9.0 "fld = 'Promo' df = df.sort_values(['Store', 'Date']) get_elapsed(fld, 'After') df = df.sort_values(['Store', 'Date'], ascending=[True, False]) get_elapsed(fld, 'Before')",No,4,9.0 df = df.set_index('Date'),No,5,53.0 "columns = ['SchoolHoliday', 'StateHoliday', 'Promo']",No,4,17.0 "for o in ['Before', 'After']: for p in columns: a = o+p df[a] = df[a].fillna(0).astype(int)",No,3,17.0 "bwd = df[['Store']+columns].sort_index().groupby(""Store"").rolling(7, min_periods=1).sum()'",No,4,60.0 "fwd = df[['Store']+columns].sort_index(ascending=False ).groupby(""Store"").rolling(7, min_periods=1).sum()'",No,4,60.0 "bwd.drop('Store',1,inplace=True) bwd.reset_index(inplace=True)",No,5,10.0 "fwd.drop('Store',1,inplace=True) fwd.reset_index(inplace=True)",No,5,10.0 df.reset_index(inplace=True),No,4,84.0 "df = df.merge(bwd, 'left', ['Date', 'Store'], suffixes=['', '_bw']) df = df.merge(fwd, 'left', ['Date', 'Store'], suffixes=['', '_fw'])",No,5,32.0 "df.drop(columns,1,inplace=True)",No,5,10.0 "df[""Date""] = pd.to_datetime(df.Date)",No,5,16.0 "joined = join_df(joined, df, ['Store', 'Date'])",No,5,32.0 "joined_test = join_df(joined_test, df, ['Store', 'Date'])",No,5,32.0 joined = joined[joined.Sales!=0],No,5,14.0 "joined.reset_index(inplace=True) joined_test.reset_index(inplace=True)",No,5,84.0 "pd.set_option('display.max_columns', None) pd.set_option('display.max_rows', None)",No,5,23.0 "train_df = joined test_df = joined_test",No,5,77.0 train_df.head().T,No,5,41.0 "print(test_df.shape) test_df.head()",No,3,41.0 n = len(train_df); n,No,5,77.0 "idx = np.random.permutation(range(n))[:2000] idx.sort() small_train_df = train_df.iloc[idx[:1000]] small_test_df = train_df.iloc[idx[1000:]] small_cont_vars = ['CompetitionDistance','Mean_Humidity'] small_cat_vars = ['Store','DayOfWeek','PromoInterval'] small_train_df = small_train_df[small_cat_vars + small_cont_vars + ['Sales']] small_test_df = small_test_df[small_cat_vars + small_cont_vars + ['Sales']]",No,4,14.0 small_train_df.head(),No,5,41.0 small_test_df.head(),No,5,41.0 "categorify = Categorify(small_cat_vars, small_cont_vars) categorify(small_train_df) categorify(small_test_df, test=True)",No,5,20.0 small_train_df.PromoInterval.cat.categories,No,5,57.0 "# we convert to categories then add 1 to -1 (NaNs) to turn it to zero because you can not look up 1 in an embedding matrix 
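# To make that shift concrete: pandas codes a missing category as -1, and adding 1
# moves it to 0 so it can index row 0 of an embedding matrix (an index of -1 cannot
# be looked up). A tiny self-contained sketch on a throwaway series follows; the
# _demo name is illustrative only and not part of the original notebook.
import pandas as pd  # pandas should already be available here; repeated so the sketch stands alone
_demo = pd.Series(['a', None, 'b'], dtype='category')
print(_demo.cat.codes.tolist())        # [0, -1, 1]  -> the missing value is coded -1
print((_demo.cat.codes + 1).tolist())  # [1, 0, 2]   -> after the +1 shift, missing maps to 0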
small_train_df['PromoInterval'].cat.codes[:5]",No,5,41.0 "fill_missing = FillMissing(small_cat_vars, small_cont_vars) fill_missing(small_train_df) fill_missing(small_test_df, test=True)",No,5,17.0 "# find any missing values, create a column called ""_na"" and set it to True any time it is missing # then replace the empty value with the median of CompetitionDistance because it needs to be a continues varaiable small_train_df[small_train_df['CompetitionDistance_na'] == True]'",No,5,14.0 "len(train_df),len(test_df)",No,5,58.0 "# as seen above, create pre processers fill missing, categorify # and normalize (normalize: for any continous var subtract the mean and divide by std) procs=[FillMissing, Categorify, Normalize]",No,5,77.0 "# name your category variables, keep some continues variables like ""day"" as cat because # as a cat var it will create an embedding matrix and the different days of the month will create different behavors cat_vars = ['Store', 'DayOfWeek', 'Year', 'Month', 'Day', 'StateHoliday', 'CompetitionMonthsOpen', 'Promo2Weeks', 'StoreType', 'Assortment', 'PromoInterval', 'CompetitionOpenSinceYear', 'Promo2SinceYear', 'State', 'Week', 'Events', 'Promo_fw', 'Promo_bw', 'StateHoliday_fw', 'StateHoliday_bw', 'SchoolHoliday_fw', 'SchoolHoliday_bw'] # name your continues variables cont_vars = ['CompetitionDistance', 'Max_TemperatureC', 'Mean_TemperatureC', 'Min_TemperatureC', 'Max_Humidity', 'Mean_Humidity', 'Min_Humidity', 'Max_Wind_SpeedKm_h', 'Mean_Wind_SpeedKm_h', 'CloudCover', 'trend', 'trend_DE', 'AfterStateHoliday', 'BeforeStateHoliday', 'Promo', 'SchoolHoliday']'",No,4,14.0 "# dependant var dep_var = 'Sales' # the final df to pass in will be the cat_vars, cont_vars, dep_var, and date, date will be used to create the validation set, #it will be the same number of records at the end of the time period as the test set from kaggle df = train_df[cat_vars + cont_vars + [dep_var,'Date']].copy()",No,3,14.0 "test_df['Date'].min(), test_df['Date'].max()",No,5,40.0 "cut = train_df['Date'][(train_df['Date'] == train_df['Date'][len(test_df)])].index.max() cut",No,5,14.0 valid_idx = range(cut),No,4,13.0 "# finally, lets look df[dep_var].head()",No,5,41.0 "# create databunch data = (TabularList.from_df(df, path='.', cat_names=cat_vars, cont_names=cont_vars, procs=procs,) .split_by_idx(valid_idx) .label_from_df(cols=dep_var, label_cls=FloatList, log=True) .add_test(TabularList.from_df(test_df, path=path, cat_names=cat_vars, cont_names=cont_vars)) .databunch())",No,5,12.0 "max_log_y = np.log(np.max(train_df['Sales'])*1.2) y_range = torch.tensor([0, max_log_y], device=defaults.device)",No,4,21.0 "# Learner learn = tabular_learner(data, layers=[1000,500], ps=[0.001,0.01], emb_drop=0.04, y_range=y_range, metrics=exp_rmspe)",No,4,7.0 learn.model,No,3,4.0 len(data.train_ds.cont_names),No,5,58.0 "learn.lr_find() learn.recorder.plot()",No,5,35.0 "learn.fit_one_cycle(5, 1e-3, wd=0.2)",No,5,7.0 learn.save('1'),No,5,50.0 learn.recorder.plot_losses(skip_start=10000),No,5,35.0 learn.load('1');,No,5,30.0 "learn.fit_one_cycle(5, 3e-4)",No,5,7.0 "import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv) import os",No,5,22.0 "# Loading data directly from CatBoost from catboost.datasets import amazon train, test = amazon()",No,3,21.0 "print(""Train shape: {}, Test shape: {}"".format(train.shape, test.shape))",No,5,58.0 train.head(5),No,5,41.0 test.head(5),No,5,41.0 train.apply(lambda x: len(x.unique())),No,5,54.0 "import itertools target = ""ACTION"" col4train = [x for x in train.columns if x!=target] col1 = 'ROLE_CODE' col2 = 'ROLE_TITLE' pair = len(train.groupby([col1,col2]).size()) single = len(train.groupby([col1]).size()) print(col1, col2, pair, single)'",No,3,71.0 col4train = [x for x in col4train if x!='ROLE_TITLE'],No,5,77.0 "#linear - OHE from sklearn.linear_model import LogisticRegression from sklearn.preprocessing import OneHotEncoder ohe = OneHotEncoder(sparse=True, dtype=np.float32, handle_unknown='ignore')",Yes,4,20.0 "X = ohe.fit_transform(train[col4train]) y = train[""ACTION""].values",No,4,21.0 "from sklearn.model_selection import cross_validate model = LogisticRegression( penalty='l2', C=1.0, fit_intercept=True, random_state=432, solver = 'liblinear', max_iter = 1000, ) stats = cross_validate(model, X, y, groups=None, scoring='roc_auc', cv=5, n_jobs=2, return_train_score = True) stats = pd.DataFrame(stats) stats.describe().transpose()",No,3,4.0 "X = ohe.fit_transform(train[col4train]) y = train[""ACTION""].values X_te = ohe.transform(test[col4train]) model.fit(X,y) predictions = model.predict_proba(X_te)[:,1] submit = pd.DataFrame() submit[""Id""] = test[""id""] submit[""ACTION""] = predictions submit.to_csv(""submission.csv"", index = False)",Yes,3,7.0 "# Loading data directly from CatBoost from catboost.datasets import amazon train, test = amazon() target = ""ACTION"" col4train = [x for x in train.columns if x not in [target, ""ROLE_TITLE""]] y = train[target].values",No,3,13.0 "from sklearn.model_selection import StratifiedKFold from sklearn.metrics import roc_auc_score from sklearn.ensemble import ExtraTreesClassifier #our small helper function, returns ExtraTrees instance def get_model(): params = { ""n_estimators"":300, ""n_jobs"": 3, ""random_state"":5436, } return ExtraTreesClassifier(**params)",No,3,59.0 "from sklearn.base import BaseEstimator, TransformerMixin class TargetEncoding(BaseEstimator, TransformerMixin): def __init__(self, columns_names ): self.columns_names = columns_names self.learned_values = {} self.dataset_mean = np.nan def fit(self, X, y, **fit_params): X_ = X.copy() self.learned_values = {} X_[""__target__""] = y for c in [x for x in X_.columns if x in self.columns_names]: self.learned_values[c] = (X_[[c,""__target__""]] .groupby(c)[""__target__""].mean() .reset_index()) self.dataset_mean = np.mean(y) return self def transform(self, X, **fit_params): transformed_X = X[self.columns_names].copy() for c in transformed_X.columns: transformed_X[c] = (transformed_X[[c]] .merge(self.learned_values[c], on = c, how = 'left') )[""__target__""] transformed_X = transformed_X.fillna(self.dataset_mean) return transformed_X def fit_transform(self, X, y, **fit_params): self.fit(X,y) return self.transform(X)'",No,5,20.0 "skf = StratifiedKFold(n_splits=5, random_state = 5451, shuffle = True) te = TargetEncoding(columns_names=col4train) X_tr = te.fit_transform(train, y).values scores = [] tr_scores = [] for train_index, test_index in skf.split(train, y): train_df, valid_df = X_tr[train_index], X_tr[test_index] train_y, valid_y = y[train_index], y[test_index] model = get_model() model.fit(train_df,train_y) predictions = 
model.predict_proba(valid_df)[:,1] scores.append(roc_auc_score(valid_y, predictions)) train_preds = model.predict_proba(train_df)[:,1] tr_scores.append(roc_auc_score(train_y, train_preds)) print(""Train AUC score: {:.4f} Valid AUC score: {:.4f}, STD: {:.4f}"".format( np.mean(tr_scores), np.mean(scores), np.std(scores) ))",Yes,2,7.0 "scores = [] tr_scores = [] for train_index, test_index in skf.split(train, y): train_df = train.loc[train_index,col4train].reset_index(drop = True) valid_df = train.loc[test_index,col4train].reset_index(drop = True) train_y, valid_y = y[train_index], y[test_index] te = TargetEncoding(columns_names=col4train) X_tr = te.fit_transform(train_df, train_y).values X_val = te.transform(valid_df).values model = get_model() model.fit(X_tr,train_y) predictions = model.predict_proba(X_val)[:,1] scores.append(roc_auc_score(valid_y, predictions)) train_preds = model.predict_proba(X_tr)[:,1] tr_scores.append(roc_auc_score(train_y, train_preds)) print(""Train AUC score: {:.4f} Valid AUC score: {:.4f}, STD: {:.4f}"".format( np.mean(tr_scores), np.mean(scores), np.std(scores) ))",Yes,2,7.0 "class TargetEncodingSmoothing(BaseEstimator, TransformerMixin): def __init__(self, columns_names,k, f ): self.columns_names = columns_names self.learned_values = {} self.dataset_mean = np.nan self.k = k # self.f = f # def smoothing_func(self, N): # return 1 / (1 + np.exp(-(N-self.k)/self.f)) def fit(self, X, y, **fit_params): X_ = X.copy() self.learned_values = {} self.dataset_mean = np.mean(y) X_[""__target__""] = y for c in [x for x in X_.columns if x in self.columns_names]: stats = (X_[[c,""__target__""]] .groupby(c)[""__target__""]. agg(['mean', 'size'])) stats[""alpha""] = self.smoothing_func(stats[""size""]) stats[""__target__""] = (stats[""alpha""]*stats[""mean""] + (1-stats[""alpha""])*self.dataset_mean) stats = (stats .drop([x for x in stats.columns if x not in [""__target__"",c]], axis = 1) .reset_index()) self.learned_values[c] = stats self.dataset_mean = np.mean(y) return self def transform(self, X, **fit_params): transformed_X = X[self.columns_names].copy() for c in transformed_X.columns: transformed_X[c] = (transformed_X[[c]] .merge(self.learned_values[c], on = c, how = 'left') )[""__target__""] transformed_X = transformed_X.fillna(self.dataset_mean) return transformed_X def fit_transform(self, X, y, **fit_params): self.fit(X,y) return self.transform(X)'",No,3,20.0 "%matplotlib inline x = np.linspace(0,100,100) plot = pd.DataFrame() te = TargetEncodingSmoothing([], 1,1) plot[""k=1|f=1""] = te.smoothing_func(x) te = TargetEncodingSmoothing([], 33,5) plot[""k=33|f=5""] = te.smoothing_func(x) te = TargetEncodingSmoothing([], 66,15) plot[""k=66|f=15""] = te.smoothing_func(x) plot.plot(figsize = (15,8))",No,5,81.0 "scores = [] tr_scores = [] for train_index, test_index in skf.split(train, y): train_df = train.loc[train_index,col4train].reset_index(drop = True) valid_df = train.loc[test_index,col4train].reset_index(drop = True) train_y, valid_y = y[train_index], y[test_index] te = TargetEncodingSmoothing( columns_names= col4train, k = 3, f = 1.5 ) X_tr = te.fit_transform(train_df, train_y).values X_val = te.transform(valid_df).values model = get_model() model.fit(X_tr,train_y) predictions = model.predict_proba(X_val)[:,1] scores.append(roc_auc_score(valid_y, predictions)) train_preds = model.predict_proba(X_tr)[:,1] tr_scores.append(roc_auc_score(train_y, train_preds)) print(""Train AUC score: {:.4f} Valid AUC score: {:.4f}, STD: {:.4f}"".format( np.mean(tr_scores), np.mean(scores), 
np.std(scores) ))",Yes,1,7.0 "def get_CV_target_encoding(data, y, encoder, cv = 5): skfTE = StratifiedKFold(n_splits=cv, random_state = 545167, shuffle = True) result = [] for train_indexTE, test_indexTE in skfTE.split(data, y): encoder.fit(data.iloc[train_indexTE,:].reset_index(drop = True), y[train_indexTE]) tmp = encoder.transform(data.iloc[test_indexTE,:].reset_index(drop = True)) tmp[""index""] = test_indexTE result.append(tmp) result = pd.concat(result, ignore_index = True) result = result.sort_values('index').reset_index(drop = True).drop('index', axis = 1) return result'",Yes,2,20.0 "scores = [] tr_scores = [] for train_index, test_index in skf.split(train, y): train_df = train.loc[train_index,col4train].reset_index(drop = True) valid_df = train.loc[test_index,col4train].reset_index(drop = True) train_y, valid_y = y[train_index], y[test_index] te = TargetEncodingSmoothing( columns_names= col4train, k = 3, f = 1.5 ) X_tr = get_CV_target_encoding(train_df, train_y, te, cv = 5) te.fit(train_df, train_y) X_val = te.transform(valid_df).values model = get_model() model.fit(X_tr,train_y) predictions = model.predict_proba(X_val)[:,1] scores.append(roc_auc_score(valid_y, predictions)) train_preds = model.predict_proba(X_tr)[:,1] tr_scores.append(roc_auc_score(train_y, train_preds)) print(""Train AUC score: {:.4f} Valid AUC score: {:.4f}, STD: {:.4f}"".format( np.mean(tr_scores), np.mean(scores), np.std(scores) ))",Yes,1,7.0 "class TargetEncodingExpandingMean(BaseEstimator, TransformerMixin): def __init__(self, columns_names): self.columns_names = columns_names self.learned_values = {} self.dataset_mean = np.nan def fit(self, X, y, **fit_params): X_ = X.copy() self.learned_values = {} self.dataset_mean = np.mean(y) X_[""__target__""] = y for c in [x for x in X_.columns if x in self.columns_names]: stats = (X_[[c,""__target__""]] .groupby(c)[""__target__""] .agg(['mean', 'size'])) # stats[""__target__""] = stats[""mean""] stats = (stats .drop([x for x in stats.columns if x not in [""__target__"",c]], axis = 1) .reset_index()) self.learned_values[c] = stats return self def transform(self, X, **fit_params): transformed_X = X[self.columns_names].copy() for c in transformed_X.columns: transformed_X[c] = (transformed_X[[c]] .merge(self.learned_values[c], on = c, how = 'left') )[""__target__""] transformed_X = transformed_X.fillna(self.dataset_mean) return transformed_X def fit_transform(self, X, y, **fit_params): self.fit(X,y) #Expanding mean transform X_ = X[self.columns_names].copy().reset_index(drop = True) X_[""__target__""] = y X_[""index""] = X_.index X_transformed = pd.DataFrame() for c in self.columns_names: X_shuffled = X_[[c,""__target__"", ""index""]].copy() X_shuffled = X_shuffled.sample(n = len(X_shuffled),replace=False) X_shuffled[""cnt""] = 1 X_shuffled[""cumsum""] = (X_shuffled .groupby(c,sort=False)['__target__'] .apply(lambda x : x.shift().cumsum())) X_shuffled[""cumcnt""] = (X_shuffled .groupby(c,sort=False)['cnt'] .apply(lambda x : x.shift().cumsum())) X_shuffled[""encoded""] = X_shuffled[""cumsum""] / X_shuffled[""cumcnt""] X_shuffled[""encoded""] = X_shuffled[""encoded""].fillna(self.dataset_mean) X_transformed[c] = X_shuffled.sort_values(""index"")[""encoded""].values return X_transformed'",No,4,20.0 "scores = [] tr_scores = [] for train_index, test_index in skf.split(train, y): train_df = train.loc[train_index,col4train].reset_index(drop = True) valid_df = train.loc[test_index,col4train].reset_index(drop = True) train_y, valid_y = y[train_index], y[test_index] te = 
TargetEncodingExpandingMean(columns_names=col4train) X_tr = te.fit_transform(train_df, train_y) X_val = te.transform(valid_df).values model = get_model() model.fit(X_tr,train_y) predictions = model.predict_proba(X_val)[:,1] scores.append(roc_auc_score(valid_y, predictions)) train_preds = model.predict_proba(X_tr)[:,1] tr_scores.append(roc_auc_score(train_y, train_preds)) print(""Train AUC score: {:.4f} Valid AUC score: {:.4f}, STD: {:.4f}"".format( np.mean(tr_scores), np.mean(scores), np.std(scores) ))",Yes,2,7.0 "train[col4train] = train[col4train].values.astype(str) test[col4train] = test[col4train].values.astype(str) from itertools import combinations new_col4train = col4train for c1,c2 in combinations(col4train, 2): name = ""{}_{}"".format(c1,c2) new_col4train.append(name) train[name] = train[c1] + ""_"" + train[c2] test[name] = test[c1] + ""_"" + test[c2]",No,3,78.0 "print(train[new_col4train].shape, test[new_col4train].shape) train[new_col4train].head(5)",No,4,58.0 train[new_col4train].apply(lambda x: len(x.unique())),No,5,54.0 "scores = [] tr_scores = [] for train_index, test_index in skf.split(train, y): train_df = train.loc[train_index,new_col4train].reset_index(drop = True) valid_df = train.loc[test_index,new_col4train].reset_index(drop = True) train_y, valid_y = y[train_index], y[test_index] te = TargetEncodingExpandingMean(columns_names=new_col4train) X_tr = te.fit_transform(train_df, train_y) X_val = te.transform(valid_df) te2 = TargetEncodingSmoothing( columns_names= new_col4train, k = 3, f = 1.5, ) X_tr2 = get_CV_target_encoding(train_df, train_y, te2, cv = 5) te2.fit(train_df, train_y) X_val2 = te2.transform(valid_df) X_tr = pd.concat([X_tr, X_tr2], axis = 1) X_val = pd.concat([X_val, X_val2], axis = 1) model = get_model() model.fit(X_tr,train_y) predictions = model.predict_proba(X_val)[:,1] scores.append(roc_auc_score(valid_y, predictions)) train_preds = model.predict_proba(X_tr)[:,1] tr_scores.append(roc_auc_score(train_y, train_preds)) print(""Train AUC score: {:.4f} Valid AUC score: {:.4f}, STD: {:.4f}"".format( np.mean(tr_scores), np.mean(scores), np.std(scores) ))",Yes,1,7.0 "te = TargetEncodingExpandingMean(columns_names=new_col4train) X_tr = te.fit_transform(train[new_col4train], y) X_val = te.transform(test[new_col4train]) te2 = TargetEncodingSmoothing( columns_names= new_col4train, k = 3, f = 1.5, ) X_tr2 = get_CV_target_encoding(train[new_col4train], y, te2, cv = 5) te2.fit(train[new_col4train], y) X_val2 = te2.transform(test[new_col4train]) X = pd.concat([X_tr, X_tr2], axis = 1) X_te = pd.concat([X_val, X_val2], axis = 1) model = get_model() model.fit(X,y) predictions = model.predict_proba(X_te)[:,1] submit = pd.DataFrame() submit[""Id""] = test[""id""] submit[""ACTION""] = predictions submit.to_csv(""submission.csv"", index = False)",Yes,1,7.0 "import numpy as np import pandas as pd import os print(os.listdir(""../input"")) ",No,5,88.0 "train = pd.read_csv('../input/train.csv') test = pd.read_csv('../input/test.csv')",No,5,45.0 "id=test.iloc[:,0].values test.drop('id',axis=1)",No,5,10.0 id,No,5,53.0 "X = train.iloc[:, 1:11].values y = train.iloc[:, 0].values",No,5,21.0 "from sklearn.model_selection import train_test_split X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)",No,5,13.0 "from sklearn.preprocessing import StandardScaler sc = StandardScaler() X_train = sc.fit_transform(X_train) X_test = sc.transform(X_test)",No,5,18.0 "# Fitting Random Forest Classification to the Training set from sklearn.ensemble import 
RandomForestClassifier classifier = RandomForestClassifier(n_estimators = 99, criterion = 'entropy', random_state = 0) classifier.fit(X_train, y_train)",Yes,3,7.0 y_pred = classifier.predict(X_test),No,5,48.0 "from sklearn.metrics import confusion_matrix cm = confusion_matrix(y_test, y_pred)",No,5,49.0 cm,No,5,53.0 "#for calculating accuracy (131+6090)/(131+6090+92+241)",No,5,53.0 "test.drop(['id'], axis=1, inplace = True)",No,5,10.0 test,No,5,41.0 test_pred = classifier.predict(test),No,5,48.0 test_pred,No,5,41.0 "submission = pd.DataFrame({'Id':id,'Action':test_pred})",No,5,12.0 "final_submission=submission.iloc[0:58921,:].values",No,5,14.0 final_submission,No,5,41.0 "final_submission = pd.DataFrame({'Id':final_submission[:,0],'Action':final_submission[:,-1]})",No,5,12.0 "filename = 'Amazon Employee Access .csv' final_submission.to_csv(filename,index=False) print('Saved file: ' + filename)",No,4,25.0 "import seaborn as sns import numpy as np # linear algebra import pandas as pd from sklearn.metrics import confusion_matrix from sklearn.metrics import f1_score from sklearn.metrics import accuracy_score import pandas as pd import numpy as np import matplotlib.pyplot as plt from sklearn.model_selection import train_test_split from sklearn.linear_model import LogisticRegression from sklearn.metrics import mean_squared_error from sklearn.metrics import mean_absolute_error from sklearn.metrics import r2_score from sklearn.tree import DecisionTreeRegressor from sklearn.ensemble import RandomForestClassifier import math from sklearn.neighbors import KNeighborsClassifier from sklearn.svm import SVC from sklearn.naive_bayes import GaussianNB from xgboost import XGBClassifier",No,5,22.0 "train_file_path = '/kaggle/input/covid19-global-forecasting-week-2/train.csv' test_file_path = '/kaggle/input/covid19-global-forecasting-week-2/test.csv' train_dataset = pd.read_csv(train_file_path) test_dataset = pd.read_csv(test_file_path) ",No,5,45.0 "print(""Understanding of Train Dataset:\ \ \ "") print('Train Dataset has following states:\ ') province_state = train_dataset['Province_State'].unique() print(province_state) print('\ \ \ Train Dataset has following Country Region:\ ') country_region = train_dataset['Country_Region'].unique() print(country_region) print('\ \ \ Train Dataset has records of following dates:\ ') dates = train_dataset['Date'].unique() print(dates) #convert to mm/dd/yyyy train_dataset['Date'] = pd.to_datetime(train_dataset['Date']) print('\ Train Dataset has following Date Range:') print(pd.date_range(start=train_dataset['Date'].min(), end=train_dataset['Date'].max())) '",No,3,16.0 "print(""Understanding of Test Dataset:\ \ \ "") print('Test Dataset has following states:\ ') province_state = test_dataset['Province_State'].unique() print(province_state) print('\ \ \ Test Dataset has following Country Region:\ ') country_region = test_dataset['Country_Region'].unique() print(country_region) print('\ \ \ Test Dataset has records of following dates:\ ') dates = test_dataset['Date'].unique() print(dates) #convert to mm/dd/yyyy test_dataset['Date'] = pd.to_datetime(test_dataset['Date']) print('\ Test Dataset has following Date Range:') print(pd.date_range(start=test_dataset['Date'].min(), end=test_dataset['Date'].max())) '",No,4,57.0 "print(""Train Dataset Graphical Representation of Counrtry Region w.r.t. 
Confirmed Cases"") show_cumulatively = train_dataset.groupby(by='Country_Region')[['ConfirmedCases','Fatalities']].max().reset_index() plt.figure(figsize=(20,10)) #sns.set() sns.barplot(x='ConfirmedCases',y='Country_Region',data=show_cumulatively[show_cumulatively['ConfirmedCases'] != 0].sort_values(by='ConfirmedCases',ascending=False).head(50)) '",No,4,81.0 " print(""Train Dataset Graphical Representation of Counrtry Region w.r.t. Fatalities"") plt.figure(figsize=(20,10)) sns.barplot(x='Fatalities',y='Country_Region',data=show_cumulatively[show_cumulatively['Fatalities'] != 0].sort_values(by='Fatalities',ascending=False).head(50))'",No,4,81.0 "print('Those Country Regions of Train Dataset whose Confirmed Cases have Fatalities') non_fatalities_train_df = train_dataset[train_dataset['Fatalities'] != 0] non_fatalities_train_df[['Country_Region','Date','ConfirmedCases','Fatalities']]",No,4,71.0 "print('Those Country Regions whose Confirmed Cases have not Fatalities') non_fatalities_train_df = train_dataset[train_dataset['Fatalities'] == 0] non_fatalities_train_df[['Country_Region','Date','ConfirmedCases','Fatalities']]",No,3,71.0 "b""print('The value count of Country Regions of Non-Null Province States in Train Dataset')\nprint(train_dataset[~train_dataset['Province_State'].isnull()]['Country_Region'].value_counts())\n\n\nprint('\\n\\n\\nThe value count of Country Regions of Null Province States in Train Dataset')\nprint(train_dataset[train_dataset['Province_State'].isnull()]['Country_Region'].value_counts())""",No,5,39.0 "b""print('The value count of Country Regions of Non-Null Province States in Test Dataset')\nprint(test_dataset[~test_dataset['Province_State'].isnull()]['Country_Region'].value_counts())\n\n\nprint('\\n\\n\\nThe value count of Country Regions of Null Province States in Test Dataset')\nprint(test_dataset[test_dataset['Province_State'].isnull()]['Country_Region'].value_counts())""",No,5,39.0 "print(""Train dataset before pre-processing:\ "") print(train_dataset.head()) train_dataset = train_dataset.fillna('Enpyty_value') print(""\ \ \ Train dataset after pre-processing:\ "") print(train_dataset.head())'",No,4,17.0 "print(""Test dataset before pre-processing:\ "") print(test_dataset.head()) test_dataset = test_dataset.fillna('Enpyty_value') print(""\ \ \ Test dataset after pre-processing:\ "") print(test_dataset.head())'",No,4,17.0 "labelEncoder = LabelEncoder() train_dataset['Date'] = pd.to_datetime(train_dataset['Date']).dt.strftime(""%m%d"").astype(int) train_dataset['Date'] -= 122 test_dataset['Date'] = pd.to_datetime(test_dataset['Date']).dt.strftime(""%m%d"").astype(int) test_dataset['Date'] -= 122 train_dataset.Province_State = labelEncoder.fit_transform(train_dataset.Province_State) train_dataset.Country_Region = labelEncoder.fit_transform(train_dataset.Country_Region) test_dataset.Province_State = labelEncoder.fit_transform(test_dataset.Province_State) test_dataset.Country_Region = labelEncoder.fit_transform(test_dataset.Country_Region) print('\ \ \ Train Dataset After Encoding') print(train_dataset.head(5)) print('\ \ \ Test Dataset After Encoding') print(test_dataset.head(5)) '",No,3,8.0 "#We don't need to convert it into vector because it is alreayd in vector form. 
See following print(train_dataset.head()) print(test_dataset.head()) ",No,4,84.0 "X = train_dataset[['Province_State','Country_Region','Date']] y = train_dataset[['ConfirmedCases','Fatalities']] classifier = BinaryRelevance(GaussianNB()) # train classifier.fit(X, y[['Fatalities']]) # predict predictions_fatalities = classifier.predict(test_dataset[['Province_State','Country_Region','Date']]) # train classifier.fit(X, y[['ConfirmedCases']]) # predict predictions_confirmed_cases = classifier.predict(test_dataset[['Province_State','Country_Region','Date']]) ",No,3,7.0 "output_confirmed_cases_df = pd.DataFrame(data=predictions_confirmed_cases.toarray()) output_fatalities_df = pd.DataFrame(data=predictions_fatalities.toarray()) output_confirmed_cases_df = output_confirmed_cases_df.rename(columns={0: ""ConfirmedCases""}) output_fatalities_df = output_fatalities_df.rename(columns={0: ""Fatalities""})",No,5,55.0 test_dataset.ForecastId,No,4,84.0 "result.to_csv('submission.csv', index=False) ",No,5,25.0 "filepath= '/kaggle/input/amazon-employee-access-challenge/train.csv' traindata= pd.read_csv(filepath) filepath2= '/kaggle/input/amazon-employee-access-challenge/test.csv' testdata= pd.read_csv(filepath2) testdatacopy=testdata traindata.head()",No,4,45.0 "##Thus we see that there are no null values sns.heatmap(traindata.isnull(),yticklabels=False,cbar=False,cmap='viridis')",No,5,80.0 "#Now we plot the number of people who were granted access sns.set_style('whitegrid') sns.countplot(x='ACTION',data=traindata,palette='RdBu_r')",No,5,33.0 "y=traindata['ACTION'] x=traindata.drop('ACTION',axis=1) #Splitting training and testing data x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.70,test_size=0.30, random_state=0)",No,4,21.0 "#Logistic Regression LogisticRegressor = LogisticRegression(max_iter=10000) LogisticRegressor.fit(x_train, y_train) y_predicted = LogisticRegressor.predict(x_test) mse = mean_squared_error(y_test, y_predicted) r = r2_score(y_test, y_predicted) mae = mean_absolute_error(y_test,y_predicted) print(""Mean Squared Error:"",mse) print(""R score:"",r) print(""Mean Absolute Error:"",mae) print('f1 score:') print(f1_score(y_test,y_predicted)) print('accuracy score:') print(accuracy_score(y_test,y_predicted)) '",Yes,2,7.0 "# Random Forest rf = RandomForestClassifier() rf.fit(x_train,y_train); y_predicted_r = rf.predict(x_test) mse = mean_squared_error(y_test, y_predicted_r) r = r2_score(y_test, y_predicted_r) mae = mean_absolute_error(y_test,y_predicted_r) print(""Mean Squared Error:"",mse) print(""R score:"",r) print(""Mean Absolute Error:"",mae) print('f1 score:') print(f1_score(y_test,y_predicted_r)) print('accuracy score:') print(accuracy_score(y_test,y_predicted_r)) '",Yes,2,7.0 "# Decision Tree - CART regressor = DecisionTreeRegressor(random_state = 0) regressor.fit(x_train, y_train) y_predicted_d = regressor.predict(x_test) mse = mean_squared_error(y_test, y_predicted_d) r = r2_score(y_test, y_predicted_d) mae = mean_absolute_error(y_test,y_predicted_d) print(""Mean Squared Error:"",mse) print(""R score:"",r) print(""Mean Absolute Error:"",mae) print('f1 score:') print(f1_score(y_test,y_predicted_d)) print('accuracy score:') print(accuracy_score(y_test,y_predicted_d)) '",Yes,2,7.0 "#XGBClassifier xgboost = XGBClassifier(n_estimators=1000) xgboost.fit(x_train,y_train) xg_pred = xgboost.predict(x_test) msee21 = mean_squared_error(y_test, xg_pred) ra21 = r2_score(y_test, xg_pred) maee21 = mean_absolute_error(y_test,xg_pred) print(""Mean Squared Error:"",msee21) 
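# (editor's aside - a minimal sketch, not part of the original notebook: for a binary
# target like ACTION, a ranking metric such as ROC AUC is usually more informative than
# MSE/MAE; it reuses the fitted 'xgboost' model and the x_test/y_test split defined above)
from sklearn.metrics import roc_auc_score
xg_proba = xgboost.predict_proba(x_test)[:, 1]  # predicted probability of the positive class
print('ROC AUC:', roc_auc_score(y_test, xg_proba))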
print(""R score:"",ra21) print(""Mean Absolute Error:"",maee21) print('f1 score:') print(f1_score(y_test,xg_pred)) print('accuracy score:') print(accuracy_score(y_test,xg_pred))'",No,2,7.0 "#SVM svclassifier = SVC(kernel='linear') svclassifier.fit(x_train, y_train) y_pred2 = svclassifier.predict(x_test) mseew = mean_squared_error(y_test, y_pred2) ra = r2_score(y_test, y_pred2) maeew = mean_absolute_error(y_test,y_pred2) print(""Mean Squared Error:"",mseew) print(""R score:"",ra) print(""Mean Absolute Error:"",maeew) print('f1 score:') print(f1_score(y_test,y_pred2)) print('accuracy score:') print(accuracy_score(y_test,y_pred2))'",Yes,2,7.0 "#Naive Bayes gnb = GaussianNB() y_preed = gnb.fit(x_train, y_train).predict(x_test) ms = mean_squared_error(y_test, y_preed) rae = r2_score(y_test, y_preed) mew = mean_absolute_error(y_test,y_preed) print(""Mean Squared Error:"",ms) print(""R score:"",rae) print(""Mean Absolute Error:"",mew) print('f1 score:') print(f1_score(y_test,y_preed)) print('accuracy score:') print(accuracy_score(y_test,y_preed))'",Yes,2,7.0 "#KNN math.sqrt(len(y_test)) #Therefore n neighbors=99 ",No,5,53.0 "#KNN classify= KNeighborsClassifier (n_neighbors=99, p =2, metric= 'euclidean') classify.fit(x_train,y_train) ypred1=classify.predict(x_test) msee = mean_squared_error(y_test, ypred1) r = r2_score(y_test, y_predicted_d) maee = mean_absolute_error(y_test,ypred1) print(""Mean Squared Error:"",msee) print(""R score:"",r) print(""Mean Absolute Error:"",maee) print('f1 score:') print(f1_score(y_test,ypred1)) print('accuracy score:') print(accuracy_score(y_test,ypred1))'",No,2,7.0 "testdata=testdata.drop('id',axis=1)",No,5,10.0 "# Random Forest rf = RandomForestClassifier() rf.fit(x,y) Prediction = rf.predict(testdata) ",Yes,3,7.0 "predictionlist=Prediction.tolist() Passengerid=testdatacopy['id'].tolist() output=pd.DataFrame(list(zip(Passengerid, predictionlist)), columns=['id','Action']) output.head() output.to_csv('my_submission(AmazonEmployeeAccess).csv', index=False) ",No,3,25.0 "plt.figure(figsize=(25,12)) mask = np.zeros_like(df.corr()) mask[np.triu_indices_from(mask)] = True sns.heatmap(df.corr(), cmap='coolwarm', vmax=1.0, vmin=-1.0 , mask = mask, linewidths=2.5) plt.show() ",No,5,80.0 df_train = df[df['part']== 'train'],No,5,14.0 "cols_train = ['Store', 'DayOfWeek', 'Date', 'Sales', 'Promo','PromoOpen', # 'shift_sales','shift_t7_sales', 'shift_t30_sales','shift_customer','shift_t7_customer', 'shift_t30_customer', 'StateHoliday', 'SchoolHoliday', 'StoreType', 'Assortment', 'CompetitionDistance', 'Promo2', 'Month', 'Year', 'Day','IsPromoMonth', # 'Sales_DayOfWeek', 'Sales_Promo', 'Sales_Promo2', 'Sales_Month', 'Sales_Year', 'Sales_Day', 'Sales_StateHoliday', 'Sales_StoreType', # 'Sales_Assortment', 'Customers_DayOfWeek', #'isBeforeCompetition', 'Customers_Promo', 'Customers_Promo2', 'Customers_Month', # 'Customers_Year', 'Customers_Day', 'Customers_StateHoliday','Customers_StoreType', 'Customers_Assortment', # 'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Sales_StrType_DayOfWeek','SalesPerCustomer_StrType_Assortment', 'Customers_StrType_Promo2', 'Sales_StrType_Promo', 'Sales_StrType_Promo2', 'Sales_StrType_Month','Customers_StrType_StateHoliday', 'Sales_StrType_Year', 'Sales_StrType_Day', 'Sales_StrType_StateHoliday','SalesPerCustomer_StrType_Promo2', 'Sales_StrType_Assortment', 'Customers_StrType_DayOfWeek', 'Customers_StrType_Promo', 'Customers_StrType_Month', 'Customers_StrType_Year', 'Customers_StrType_Day', 'SalesPerCustomer_StrType_StateHoliday', 
'Customers_StrType_Assortment', 'SalesPerCustomer_StrType_DayOfWeek', 'SalesPerCustomer_StrType_Promo', 'SalesPerCustomer_StrType_Month', 'SalesPerCustomer_StrType_Year', 'SalesPerCustomer_StrType_Day', # 'Sales_StrType_Quarter', 'Customers_StrType_Quarter', 'Sales_Year_Quarter','Customers_Year_Quarter', 'Customers_Quarter','Sales_Quarter', # 'Quarter' ] ",No,5,77.0 "params = {""objective"": ""reg:linear"", # for linear regression ""booster"" : ""gbtree"", # use tree based models ""eta"": 0.02, # learning rate ""max_depth"": 11, # maximum depth of a tree ""subsample"": 0.9, # Subsample ratio of the training instances ""colsample_bytree"": 0.7, # Subsample ratio of columns when constructing each tree ""silent"": 1, # silent mode ""seed"": 10, # Random number seed 'tree_method': 'gpu_hist', } num_boost_round = 800 def rmspe_xg(yhat, y): y = np.expm1(y.get_label()) yhat = np.expm1(yhat) return ""rmspe"", rmspe(y,yhat) import xgboost as xg",No,4,23.0 "tmp= pd.pivot_table(data, ['Date'], ""Store"", aggfunc=""count"").reset_index().sort_values('Date', ascending=False).head(300) top_stores = tmp[""Store""].values'",No,3,8.0 "# from sklearn.manifold import TSNE # from sklearn.preprocessing import StandardScaler def process(x, cols=None, all_stores=False): x.sort_values(""Date"",inplace=True) # scaler = StandardScaler() if cols is None: cols = x.columns x = x.fillna(x.median()) # for i in x.columns[(x.dtypes.values == np.dtype('float64'))]: # if i not in ['Id', 'Promo2SinceWeek', 'Promo2SinceYear','CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Sales', # 'Quarter', 'WeekOfYear', 'PromoOpen', 'Promo2SinceWeek', 'Promo2SinceYear']: # x[i] = np.round(np.log1p(x[i]),2) x_train = x[x[""Date""]<=""2015-06-12""][cols].copy() x_test = x[x[""Date""]>""2015-06-12""][cols].copy() store_test = x_test['Store'].unique().tolist() x_train = x_train[(x_train['Store'].isin(store_test))] y_train = np.log(x_train['Sales']) if all_stores: rmv = ['Date', 'Sales'] else: rmv = ['Date', 'Sales', 'Store'] x_train= x_train.drop(rmv, 1) x_train = pd.get_dummies(x_train) x_train_arr = x_train.values x_test_arr = pd.get_dummies(x_test.drop(rmv, 1)).values #scaler.fit(x_train_arr) #x_train_arr = scaler.transform(x_train_arr) #x_test_arr = scaler.transform(x_test_arr) #reduc = TSNE(n_components=2) #reduc.fit(x_train_arr) #x_train_arr = reduc.transform(x_train_arr) #x_test_arr = reduc.transform(x_test_arr) return x_train.columns, x_train_arr, y_train, x_test, x_test_arr'",Yes,4,1.0 "fig, ax = plt.subplots(5, 2, figsize=(25, 15)) X = df_train[df_train['Store'].isin(top_stores)] .copy() X_train_col, X_train_arr, Y_train, X_test, X_test_arr = process(X, cols_train, True) dtrain = xgb.DMatrix(X_train_arr, Y_train) estimator = xgb.train(params, dtrain, num_boost_round, feval=rmspe_xg,) Y_pred = estimator.predict(xgb.DMatrix(X_test_arr)) X_test[""Pred""] = np.exp(Y_pred) scores = np.round(mean_squared_error(X_test['Sales'], X_test[""Pred""])) cpt = 0 for i in top_stores[:5]: x_train = df_train[df_train[""Store""]==i] x_test = X_test[X_test[""Store""]==i] ax[cpt, 0].plot(x_train[""Date""], x_train[""Sales""]) ax[cpt, 0].plot(x_test[""Date""], x_test[""Pred""]) ax[cpt, 0].set_title(i) ax[cpt, 1].scatter(x_test[""Date""].values, x_test['Sales'].values - x_test[""Pred""].values) ax[cpt, 1].plot(x_test[""Date""], [0 for _ in range(len(x_test))]) ax[cpt, 1].set_title( np.round(mean_squared_error(X_test['Sales'], X_test[""Pred""]))) #feat_importances = pd.Series(reg.feature_importances_, index=X_train_col) 
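# (editor's note, sketch only: 'reg' does not appear in this cell - the model trained here
#  is the Booster returned by xgb.train, which reports importances via get_score rather
#  than feature_importances_, e.g.)
# imp = pd.Series(estimator.get_score(importance_type='gain'))
# imp.nlargest(10).sort_values().plot(kind='barh', ax=ax[cpt, 2])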
#feat_importances.nlargest(10).sort_values(ascending = True).plot(kind='barh', ax=ax[cpt, 2]) # ax[cpt, 2].set_xlabel('importance') cpt+=1 plt.tight_layout() print (np.mean(scores))'",Yes,3,1.0 "import pandas as pd import numpy as np import matplotlib.pyplot as plt import matplotlib.image as mpimg import seaborn as sns %matplotlib inline np.random.seed(2) from sklearn.model_selection import train_test_split from sklearn.metrics import confusion_matrix import itertools from keras.utils.np_utils import to_categorical # convert to one-hot-encoding from keras.models import Sequential, save_model, load_model from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D from keras.optimizers import RMSprop, Adam from keras.preprocessing.image import ImageDataGenerator from keras.callbacks import ReduceLROnPlateau, TensorBoard, ModelCheckpoint sns.set(style='white', context='notebook', palette='deep')",No,5,23.0 "### load data train = pd.read_csv('../input/training/training.csv') test = pd.read_csv('../input/test/test.csv') sample = pd.read_csv('../input/SampleSubmission.csv') look_id = pd.read_csv('../input/IdLookupTable.csv')",No,5,45.0 train.tail().T,No,5,41.0 "train.fillna(method='ffill', inplace=True) train.tail().T",No,4,17.0 train.isnull().any().describe(),No,4,40.0 "Img = [] for i in range(7049): img = train[""Image""][i].split(' ') img = ['0' if x=='' else x for x in img] Img.append(img)'",No,4,17.0 "PATH_WEEK2='/kaggle/input/covid19-global-forecasting-week-2' df_train = pd.read_csv(f'{PATH_WEEK2}/train.csv') df_test = pd.read_csv(f'{PATH_WEEK2}/test.csv') df_train.head() df_test.head() df_train.rename(columns={'Country_Region':'Country'}, inplace=True) df_test.rename(columns={'Country_Region':'Country'}, inplace=True) df_train.rename(columns={'Province_State':'State'}, inplace=True) df_test.rename(columns={'Province_State':'State'}, inplace=True) df_train['Date'] = pd.to_datetime(df_train['Date'], infer_datetime_format=True) df_test['Date'] = pd.to_datetime(df_test['Date'], infer_datetime_format=True) df_train.info() df_test.info() y1_Train = df_train.iloc[:, -2] y1_Train.head() y2_Train = df_train.iloc[:, -1] y2_Train.head() EMPTY_VAL = ""EMPTY_VAL"" def fillState(state, country): if state == EMPTY_VAL: return country return state '",No,3,45.0 "Y_train = train.drop('Image', axis=1) Y_train = Y_train.values Y_train = np.array(Y_train, dtype='float') Y_train.shape, X_train.shape",No,4,21.0 "# keras CNN # model = Sequential() model.add(Conv2D(filters=32, kernel_size=(5,5), padding = 'same', activation = 'relu', input_shape = (96,96,1))) model.add(MaxPool2D(pool_size = (2,2))) model.add(Dropout(0.25)) model.add(Conv2D(filters=32, kernel_size=(3,3), padding = 'same', activation = 'relu')) model.add(MaxPool2D(pool_size = (2,2))) model.add(Dropout(0.25)) model.add(Flatten()) model.add(Dense(256, activation = 'relu')) model.add(Dropout(0.1)) model.add(Dense(30))",No,3,4.0 "optimizer = RMSprop(lr = 0.001, epsilon = 1e-8) optimizer1 =Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0) model.compile(optimizer1, loss = ""mse"", metrics = [""accuracy""] )",No,3,4.0 " learning_rate_reduction = ReduceLROnPlateau(monitor='val_acc', patience=3, verbose=1, factor=0.5, min_lr=0.00001) tensorboard = TensorBoard(log_dir = './output') modelcheckpoint = ModelCheckpoint(filepath='./optimized_model.h5', monitor=""val_loss"", save_best_only=True, mode=""min"") callback_list = [learning_rate_reduction, tensorboard, modelcheckpoint]'",No,4,28.0 "model1 = 
Sequential([Flatten(input_shape=(96,96,1)), Dense(128, activation=""relu""), Dropout(0.1), Dense(64, activation=""relu""), Dense(30) ]) model1.compile(optimizer='adam', loss='mse', metrics=['mae','accuracy'])'",No,3,4.0 "batch_size =100 epochs = 50 history = model.fit(X_train, Y_train, epochs = epochs, batch_size=batch_size, callbacks=callback_list, validation_split=0.1, verbose = 2)",No,4,7.0 "# Plot the loss and accuracy curves for training and validation fig, ax = plt.subplots(2,1) ax[0].plot(history.history['loss'], color='b', label=""Training loss"") ax[0].plot(history.history['val_loss'], color='r', label=""validation loss"",axes =ax[0]) legend = ax[0].legend(loc='best', shadow=True) ax[1].plot(history.history['acc'], color='b', label=""Training accuracy"") ax[1].plot(history.history['val_acc'], color='r',label=""Validation accuracy"") legend = ax[1].legend(loc='best', shadow=True)'",No,5,35.0 "#preparing test data timag = [] for i in range(0,1783): timg = test['Image'][i].split(' ') timg = ['0' if x == '' else x for x in timg] timag.append(timg)",No,3,14.0 "X_test = np.array(timag,dtype = 'float') X_test = X_test/255 X_test = X_test.reshape(-1,96,96,1) X_test.shape",No,4,21.0 "opt_model = load_model('./optimized_model.h5') ",No,5,30.0 "pred = model.predict(X_test, batch_size = 100) pred.shape",No,4,48.0 "feature = [] for f in list( look_id['FeatureName']): feature.append(lookid_list.index(f))",No,2,8.0 "rowid = pd.Series(rowid,name = 'RowId') loc = pd.Series(preded,name = 'Location') submission = pd.concat([rowid,loc],axis = 1) submission.to_csv('face_key_detection_submission.csv',index = False)",No,4,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load in import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) import gc import matplotlib.pyplot as plt from PIL import Image # Input data files are available in the ""../input/"" directory. # For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory import os print(os.listdir(""../input"")) # Any results you write to the current directory are saved as output.",No,4,22.0 df = pd.read_csv('../input/training/training.csv'),No,5,45.0 "print(df.isnull().any().value_counts(), df.shape) df.dropna(inplace=True) #df.fillna(method = 'ffill',inplace = True) #df.reset_index(drop = True, inplace = True) print(df.isnull().any().value_counts(), df.shape)",No,3,17.0 "df = df.sample(frac=1) img_data = df['Image'].values df.drop('Image', inplace=True, axis=1)",No,4,10.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv) from keras.models import Sequential from keras.layers import Dense,Flatten,Dropout,Conv2D,MaxPooling2D import tensorflow as tf import matplotlib.pyplot as plt from PIL import Image # Input data files are available in the read-only ""../input/"" directory # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory import os for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename)) # You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using ""Save & Run All"" # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session'",No,5,88.0 !unzip ../input/facial-keypoints-detection/training.zip -d train,No,5,44.0 !unzip ../input/facial-keypoints-detection/test.zip -d test,No,5,84.0 "train = pd.read_csv(""../working/train/training.csv"")",No,5,45.0 "test = pd.read_csv(""../working/test/test.csv"")",No,5,45.0 print(test),No,5,41.0 train.head().T,No,5,41.0 "train.fillna(method='ffill',inplace=True)",No,5,17.0 print(train),No,5,41.0 "len(train[""Image""][4].split(' '))'",No,3,40.0 "images = np.ndarray((7049,9216)) for i in range(7049): img = np.array(train[""Image""][i].split(' ')) img = ['0' if x == '' else x for x in img] images[i,:] = img'",No,3,14.0 "Y_test = np.ndarray((1783,9216)) for i in range(1783): img = np.array(test[""Image""][i].split(' ')) img = ['0' if x == '' else x for x in img] Y_test[i,:] = img'",No,4,17.0 "images = images.reshape(-1,96,96,1)",No,5,84.0 "Y_test = Y_test.reshape(-1,96,96,1)",No,5,84.0 images.shape,No,5,58.0 "plt.imshow(images[34].reshape(96,96),cmap='gray')",No,5,84.0 "train.drop('Image',axis=1)",No,5,10.0 "Y_train = np.array(train.drop(""Image"",axis=1),dtype='float')'",No,5,21.0 print(Y_train.shape),No,5,58.0 "model = Sequential() model.add(Conv2D(32,(3,3),input_shape=(96,96,1),padding = 'SAME',activation='relu')) model.add(Conv2D(32,(3,3),padding = 'SAME',activation='relu')) model.add(MaxPooling2D(pool_size=(2,2),strides=(2,2))) model.add(Dropout(0.2)) model.add(Conv2D(64,(3,3),padding = 'SAME',activation='relu')) model.add(Conv2D(64,(3,3),padding = 'SAME',activation='relu')) model.add(MaxPooling2D(pool_size=(2,2),strides=(2,2))) model.add(Dropout(0.2)) model.add(Conv2D(128,(3,3),padding = 'SAME',activation='relu')) model.add(Conv2D(128,(3,3),padding = 'SAME',activation='relu')) model.add(Conv2D(128,(3,3),padding = 'SAME',activation='relu')) model.add(MaxPooling2D(pool_size=(2,2),strides=(2,2))) model.add(Dropout(0.2)) model.add(Conv2D(256,(3,3),padding = 'SAME',activation='relu')) model.add(Conv2D(256,(3,3),padding = 'SAME',activation='relu')) model.add(Conv2D(256,(3,3),padding = 'SAME',activation='relu')) model.add(MaxPooling2D(pool_size=(2,2),strides=(2,2))) model.add(Dropout(0.2)) model.add(Conv2D(512,(3,3),padding = 'SAME',activation='relu')) model.add(Conv2D(512,(3,3),padding = 'SAME',activation='relu')) model.add(Conv2D(512,(3,3),padding = 'SAME',activation='relu')) model.add(MaxPooling2D(pool_size=(2,2),strides=(2,2))) model.add(Dropout(0.2)) model.add(Flatten()) model.add(Dense(units=512,activation='relu')) model.add(Dense(units=30)) model.summary()",No,3,4.0 "model.compile(loss='mean_squared_error',optimizer='adam',metrics=['mae'])",No,3,4.0 "model.fit(images,Y_train,epochs=10,batch_size=256,validation_split=0.2)",No,5,7.0 pred = model.predict(Y_test),No,5,48.0 "lookid_data = 
pd.read_csv(""/kaggle/input/facial-keypoints-detection/IdLookupTable.csv"")",No,5,45.0 "lookid_list = list(lookid_data['FeatureName']) imageID = list(lookid_data['ImageId']-1) pre_list = list(pred) rowid = lookid_data['RowId'] rowid=list(rowid) feature = [] for f in list(lookid_data['FeatureName']): feature.append(lookid_list.index(f)) preded = [] for x,y in zip(imageID,feature): preded.append(pre_list[x][y]) rowid = pd.Series(rowid,name = 'RowId') loc = pd.Series(preded,name = 'Location') submission = pd.concat([rowid,loc],axis = 1) submission.to_csv('face_key_detection_submission.csv',index = False)",Yes,4,25.0 "train_file = 'training.csv' test_file = 'test.csv' lookup_file = '../input/facial-keypoints-detection/IdLookupTable.csv' train = pd.read_csv(train_file) test = pd.read_csv(test_file) lookup = pd.read_csv(lookup_file) ",No,4,45.0 "import tensorflow as tf import numpy as np import pandas as pd from tensorflow import keras from keras.preprocessing.image import ImageDataGenerator import matplotlib as mpl import matplotlib.pyplot as plt !pip install py7zr from keras.preprocessing.image import load_img,img_to_array from py7zr import unpack_7zarchive import shutil import os shutil.register_unpack_format('7zip', ['.7z'], unpack_7zarchive) ",No,4,87.0 "shutil.unpack_archive('/kaggle/input/cifar-10/train.7z', '/kaggle/working')",No,4,73.0 " train_dir = os.listdir(""./train""); train_dir_len = len(train_dir) print("".\\\\train:\\t"",train_dir_len) print(""files:\\t\\t"",train_dir[:3])'",No,5,88.0 "train_labels = pd.read_csv('/kaggle/input/cifar-10/trainLabels.csv',dtype=str) train_images = pd.DataFrame(columns = ['id','label','path'],dtype=str) test_labels = pd.read_csv('/kaggle/input/cifar-10/sampleSubmission.csv') train_labels.info()",No,4,45.0 "path_base = '/kaggle/working/train/' for index in range(0,train_dir_len): path = path_base + str(index+1)+'.png' if os.path.exists(path): train_images = train_images.append([{ 'id': str(train_labels['id'].iloc[index]),'path': path, 'label':train_labels['label'].iloc[index]}]) train_images.head(2)",No,3,41.0 train_images.head(2),No,5,41.0 "display_groupby = train_images.groupby(['label']).count() display_groupby.head(10)",No,4,60.0 "class_names = ['airplane','automobile','bird','cat','deer','dog','frog','horse','ship','truck'] for name in class_names: index = class_names.index(name) train_images.loc[train_images.label==name,'label'] = str(index) display_groupby = train_images.groupby(['label']).count() display_groupby.head(10)",No,4,60.0 "path_base = '/kaggle/working/train' batch_size = 64 train_data_generator = ImageDataGenerator( rescale=1./255., validation_split=0.2, horizontal_flip=True ) train_generator = train_data_generator.flow_from_dataframe(dataframe=train_images, directory=""./train/"", x_col=""path"", y_col=""label"", subset=""training"", batch_size=batch_size, shuffle=True, target_size=(32,32), class_mode=""categorical"")'",Yes,4,31.0 num_classes = 10,No,5,77.0 "validation_generator = train_data_generator.flow_from_dataframe(dataframe=train_images, directory=""./train/"", x_col=""path"", y_col=""label"", subset=""validation"", batch_size=batch_size, shuffle=True, target_size=(32,32), class_mode=""categorical"")",No,5,84.0 "b""train_size = len(train_generator.filenames)\nvalidation_size = len(validation_generator.filenames)\nprint('validation_size:\\t',validation_size)\nprint('train_size:\\t\\t',train_size)""",No,5,58.0 "index = 0 fig = plt.figure(figsize = (16,10)) for item in train_images.values[:20]: index += 1 plt.subplot(5, 5, 
index) test_path = item[2] test_image = load_img(test_path, target_size=(32,32)) plt.imshow(test_image) plt.colorbar() plt.grid(False) plt.axis(""off"") plt.title(class_names[int(item[1])]) plt.show()",No,5,84.0 "import warnings warnings.filterwarnings(""ignore"") import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns from sklearn.model_selection import cross_val_score",No,4,22.0 "# Import dataset df = pd.read_csv('../input/loan-default-prediction/train_v2.csv.zip') df.head()",No,4,45.0 "# Check duplication in dataframe df[df.duplicated()].shape",No,5,38.0 "# The number of each data type in the dataframe df.dtypes.value_counts()",No,5,72.0 "# Loss Distribution fig , ax = plt.subplots() plt.hist(df['loss'], bins = 20, range=(0,100)) ax.set_ylim([0,3000]) plt.show()",No,5,33.0 "# Calculate percent of missing in each row df['num_missing'] = df.isnull().sum(axis = 1)/df.shape[1] # Drop row that percent of missing more than 20% missing_row = df[df['num_missing'] > 0.20].index df.drop(df.index[missing_row], inplace = True) df.shape",No,4,17.0 "# Drop id and num_missing collumn df.drop(columns = ['id','num_missing'], inplace = True)",No,5,10.0 "# Calculate percent of missing in each column col_pct_miss = [] for col in df.columns: percent_miss = np.mean(df[col].isnull())*100 if percent_miss > 0: col_pct_miss.append([col, percent_miss]) col_pct_miss_df = pd.DataFrame(col_pct_miss, columns = ['column_name','% of Missing']).sort_values(by = '% of Missing', ascending = False) col_pct_miss_df",No,4,17.0 "# Impute missing value in numeric columns with median numeric_cols = df.select_dtypes(include=['number']).columns.values for col in numeric_cols: if col in list(col_pct_miss_df.column_name) : med = df[col].median() df[col] = df[col].fillna(med)",No,5,17.0 "# Impute missing value in categorical columns with mode not_numeric_cols = df.select_dtypes(exclude=['number']).columns.values for col in not_numeric_cols: if col in list(col_pct_miss_df.column_name): mode = df[col].mode()[0] df[col] = df[col].fillna(mode)",No,5,17.0 "# Check missing value df.isnull().sum().value_counts()",No,5,39.0 "# Drop Highly Corelated Columns # Create correlation matrix corr_matrix = df.corr().abs() # Select upper triangle of correlation matrix upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool)) # Find index of feature columns with correlation greater than 0.95 to_drop = [column for column in upper.columns if any(upper[column] > 0.95)] df.drop(columns = to_drop, inplace = True)",No,5,10.0 "#Drop Repetitive Columns num_rows = df.shape[0] rep_cols = [] for col in df.loc[:, df.columns != 'loss'].columns : cnts = df[col].value_counts() top_pct = (cnts/num_rows).iloc[0] if top_pct > 0.80: rep_cols.append([col,top_pct]) rep_col_df = pd.DataFrame(rep_cols, columns = ['column_name','% top repetitve value']).sort_values(by = '% top repetitve value', ascending = False).reset_index(drop=True) rep_col_df df.shape",No,3,10.0 "cat_cols = df.select_dtypes(exclude=['number']).columns.values drop_cols = [] keep_cols = [] for col in cat_cols: if df[col].value_counts().count() > 20000 : print('column {} has {} categories > drop'.format(col,df[col].value_counts().count())) drop_cols.append(col) else : print('column {} has {} categories > keep'.format(col,df[col].value_counts().count())) keep_cols.append(col)",No,4,10.0 "# Binary Encoding import category_encoders as ce encoder = ce.BinaryEncoder(cols = keep_cols) bi_enc_df = encoder.fit_transform(df[keep_cols]) bi_col_name = 
bi_enc_df.columns bi_enc_df.head() #Add Binary Encding to dataframe and drop all categorical columns df = pd.concat([df,bi_enc_df],axis = 1) df.head()",No,3,20.0 "# Add a 'loan_status' collumn which 1 represents default loan and 0 represents not default loan. df['loan_status'] = np.where(df['loss'] > 0, 1, 0) df.head()",No,5,8.0 "# After generate a visualization from loan_status in dataframe. # We found that the data is imbalance. ax = sns.countplot(x = 'loan_status', data=df) plt.show() df['loan_status'].value_counts()",No,5,33.0 "from sklearn.model_selection import train_test_split X = resample_df.drop(columns = ['loss','loan_status']) Y = resample_df['loss'] X_train, X_test, Y_train, Y_test = train_test_split( X, Y, test_size = 0.2, random_state = 1234, stratify = resample_df['loan_status']) print('training set = {} records, test set= {} records'.format(X_train.shape[0],X_test.shape[0]))",No,4,21.0 "from sklearn.feature_selection import SelectPercentile , SelectKBest, f_regression , f_classif",No,5,22.0 "#Select top 170 important numerical columns with filter method X_train_num = X_train.drop(columns = bi_col_name) selector = SelectKBest(score_func = f_regression, k = 170) selector.fit(X_train_num,Y_train) select_cols = selector.get_support(indices = True) select_num_cols = X_train_num.iloc[:,select_cols] select_num_col_name = select_num_cols.columns select_num_cols.head()",No,5,86.0 "#Select top 150 important numerical columns with RFE from sklearn.feature_selection import RFE from sklearn.linear_model import LogisticRegression selector = RFE(LogisticRegression(), n_features_to_select=150, step=1, verbose = 2) selector = selector.fit(select_num_cols, Y_train) select_cols = selector.get_support(indices = True) select_cols_df = select_num_cols.iloc[:,select_cols] best_X_col_name = select_cols_df.columns select_cols_df.head()",No,5,86.0 "# Select top 5 important categorical columns with filter method from sklearn.preprocessing import OrdinalEncoder from sklearn.preprocessing import LabelEncoder X_train_cat = X_train.select_dtypes(exclude = 'number').copy() # Create encoder le = LabelEncoder() X_train_cat = X_train_cat.apply(lambda col: le.fit_transform(col.astype(str)), axis=0, result_type='expand') # Prepare input data oe = OrdinalEncoder() oe.fit(X_train_cat) X_train_cat_enc = oe.transform(X_train_cat) selector = SelectKBest(score_func = f_classif , k=5) selector.fit(X_train_cat_enc,Y_train) select_cols = selector.get_support(indices = True) select_cat_cols = X_train_cat.iloc[:,select_cols] select_cat_col_name = select_cat_cols.columns select_cat_cols.head()",No,3,86.0 "#Combine categorical and non-categorical dataframe together def filter_x_df(x): df = x.copy() all_filter_col = [] for keep in select_cat_col_name[select_cat_col_name.isin(keep_cols)]: filter_col = [col for col in df.columns if col.startswith(str(keep))] for col in filter_col : if col not in keep_cols: all_filter_col.append(col) drop_cat_df = df.drop(columns = cat_cols) new_df = pd.concat([drop_cat_df[best_X_col_name],drop_cat_df[all_filter_col]],axis = 1) return new_df",No,5,11.0 filter_X_train.head(),No,5,41.0 "from sklearn.preprocessing import StandardScaler scaler = StandardScaler() scaler.fit(filter_X_train) X_train_scal = scaler.fit_transform(filter_X_train) X_test_scal = scaler.fit_transform(filter_X_test)",No,5,18.0 "from sklearn.neighbors import KNeighborsClassifier neigh = KNeighborsClassifier() neigh.fit(X_train_scal, Y_train) Knn_y_pred_train = neigh.predict(X_train_scal) Knn_y_pred_test = 
neigh.predict(X_test_scal) scores_kn = cross_val_score(estimator = neigh, y = Y_train, X = X_train_scal, cv=5) print('Cross Validation Score:', np.mean(scores_kn))",Yes,4,7.0 "from sklearn.linear_model import LogisticRegression logisticRegr = LogisticRegression() logisticRegr = logisticRegr.fit(X_train_scal, Y_train) Lr_y_pred_train = logisticRegr.predict(X_train_scal) Lr_y_pred_test = logisticRegr.predict(X_test_scal) scores_lr = cross_val_score(estimator = logisticRegr, y = Y_train, X = X_train_scal, cv=5) print('Cross Validation Score:', np.mean(scores_lr))",Yes,4,7.0 "from sklearn.ensemble import RandomForestClassifier rf = RandomForestClassifier(max_depth = 70) rf.fit(filter_X_train, Y_train) rf_y_pred_train = rf.predict(filter_X_train) rf_y_pred_test = rf.predict(filter_X_test) scores_rf = cross_val_score(estimator = rf, y = Y_train, X = filter_X_train, cv=5) print('Cross Validation Score:', np.mean(scores_rf))",Yes,4,7.0 "from xgboost import XGBClassifier xgb = XGBClassifier(gamma=0, learning_rate=0.1, max_depth=100, n_estimators=100) xgb.fit(filter_X_train,Y_train) xgb_y_pred_train = xgb.predict(filter_X_train) xgb_y_pred_test = xgb.predict(filter_X_test) scores_xg = cross_val_score(estimator = rf, y = Y_train, X = filter_X_train, cv=5) print('Cross Validation Score:', np.mean(scores_xg))",Yes,4,7.0 "# Split Train Set & Test Set from sklearn.model_selection import train_test_split Y = X_sm['loss'] X = X_sm.drop(columns = 'loss') X_train, X_test, Y_train, Y_test = train_test_split( X, Y, test_size = 0.2, random_state = 1234, stratify = Y_sm) print('training set = {} records, test set= {} records'.format(X_train.shape[0],X_test.shape[0]))",No,4,13.0 "# Select top 170 important numerical columns with filter method X_train_num = X_train.drop(columns = bi_col_name) selector = SelectKBest(score_func = f_regression, k = 170) selector.fit(X_train_num,Y_train) select_cols_sm = selector.get_support(indices = True) select_num_cols_sm = X_train_num.iloc[:,select_cols_sm] select_num_col_name = select_num_cols_sm.columns select_num_cols_sm.head()",No,5,86.0 "# Select top 150 important numerical columns with RFE from sklearn.feature_selection import RFE from sklearn.linear_model import LogisticRegression selector = RFE(LogisticRegression(), n_features_to_select=150, step=1, verbose = 2) selector = selector.fit(select_num_cols_sm, Y_train) select_cols_sm = selector.get_support(indices = True) select_cols_df_sm = select_num_cols_sm.iloc[:,select_cols_sm] best_X_col_name_sm = select_cols_df_sm.columns select_cols_df_sm.head()",No,5,86.0 "# Combine categorical and non-categorical dataframe together def filter_x_df_sm(x): df = x.copy() all_filter_col = [] for keep in select_cat_col_name[select_cat_col_name.isin(keep_cols)]: filter_col = [col for col in df.columns if col.startswith(str(keep))] for col in filter_col : if col not in keep_cols: all_filter_col.append(col) new_df = pd.concat([df[best_X_col_name_sm],df[all_filter_col]],axis = 1) return new_df",No,5,11.0 "#Standardize scaler = StandardScaler() scaler.fit(filter_X_train_sm) X_train_scal_sm = scaler.fit_transform(filter_X_train_sm) X_test_scal_sm = scaler.fit_transform(filter_X_test_sm)",No,5,18.0 "# Train Logistic Regression model logisticRegr_sm = LogisticRegression() logisticRegr_sm = logisticRegr_sm.fit(X_train_scal_sm, Y_train) Lr_y_pred_train_sm = logisticRegr_sm.predict(X_train_scal_sm) Lr_y_pred_test_sm = logisticRegr_sm.predict(X_test_scal_sm) scores_lr_sm = cross_val_score(estimator = logisticRegr_sm, y = Y_train, X = X_train_scal_sm, 
cv=5) print('Cross Validation Score:', np.mean(scores_lr_sm))",Yes,4,7.0 "cat_cols = df.select_dtypes(exclude=['number']).columns.values for col in cat_cols: if df[col].value_counts().count() > 20000 : print('Column {} has {} categories'.format(col,df[col].value_counts().count()))",No,5,54.0 "b""featurename = filter_X_train.columns\nimportances = list(rf.feature_importances_)\n\nfeature_importances = [(feature, round(importance, 3)) for feature, importance in zip(featurename, importances)]\nfeature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)\n\nprint('Top 50 Importance Features\\n')\n[print('Variable: {} Importance Score: {}'.format(*pair)) for pair in feature_importances[:50]];""",No,5,86.0 "test_df = pd.read_csv('../input/loan-default-prediction/test_v2.csv.zip') test_df.head()",No,5,45.0 "# Binary Encoding encoder = ce.BinaryEncoder(cols = keep_cols) bi_enc_df = encoder.fit_transform(test_df[keep_cols]) bi_col_name = bi_enc_df.columns test_df = pd.concat([test_df,bi_enc_df],axis = 1) test_df.head()",Yes,4,20.0 "# Create select_test_df by drop some columns in test_df select_test_df = pd.concat([test_df['id'],test_df[filter_X_train.columns]],axis = 1) select_test_df.head()",No,4,11.0 "# Check missing value select_test_df.isnull().sum().value_counts()",No,5,39.0 "# Impute missing value in numeric columns with median numeric_cols = select_test_df.select_dtypes(include=['number']).columns.values for col in numeric_cols: if col in list(col_pct_miss_df.column_name): med = df[col].median() select_test_df[col] = select_test_df[col].fillna(med) not_numeric_cols = select_test_df.select_dtypes(exclude=['number']).columns.values for col in not_numeric_cols: mode = df[col].mode() select_test_df[col] = select_test_df[col].fillna(mode[0])",No,5,17.0 "# Find columns that contain missing value nan_columns = select_test_df.isna().any() columns_with_nan = select_test_df.columns[nan_columns].tolist() columns_with_nan",No,3,71.0 "# Replace missing value with zero select_test_df[columns_with_nan] = select_test_df[columns_with_nan].fillna(0)",No,5,17.0 select_test_df.head(),No,5,41.0 "# Random Forest test_df_rf = test_df.copy() test_df_rf['loss'] = rf.predict(select_test_df.loc[:,select_test_df.columns != 'id']) test_df_rf.head()",No,5,48.0 "# Export sample_submission of random forest sample_submission = test_df_rf[['id','loss']] sample_submission.to_csv('sample_submission_rf.csv', index = False)",No,5,25.0 "import pandas as pd import numpy as np import matplotlib.pyplot as plt from matplotlib import rcParams import seaborn as sb from collections import Counter import warnings warnings.filterwarnings(""ignore"") from sklearn.linear_model import LogisticRegression from sklearn.svm import SVC from sklearn.tree import DecisionTreeClassifier from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,VotingClassifier from sklearn.naive_bayes import GaussianNB,MultinomialNB from sklearn.neighbors import KNeighborsClassifier from sklearn.neural_network import MLPClassifier from xgboost import XGBClassifier from sklearn.linear_model import SGDClassifier from sklearn.naive_bayes import BernoulliNB from xgboost import XGBClassifier from sklearn.preprocessing import LabelEncoder,normalize,MinMaxScaler from sklearn.metrics import f1_score from sklearn.model_selection import train_test_split,cross_val_score from sklearn.metrics import confusion_matrix,roc_auc_score,roc_curve import seaborn as sns",No,5,23.0 "import tensorflow as tf # GPU device Check. 
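# (editor's aside - a minimal sketch assuming TensorFlow 2.1+: the physical device list can
#  also be inspected directly; the gpu_device_name() check below is kept unchanged)
gpus = tf.config.list_physical_devices('GPU')
print('GPUs visible to TensorFlow:', gpus)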
device_name = tf.test.gpu_device_name() if device_name == '/device:GPU:0': print('Found GPU at: {}'.format(device_name)) else: raise SystemError('GPU device not found')",Yes,4,22.0 "# Reading data train = pd.read_csv('../input/higgs-boson/training.zip') test = pd.read_csv('../input/higgs-boson/test.zip')",No,5,45.0 "b""print(train.columns.values,'\\n')\nprint(test.columns.values)""",No,5,71.0 "train = train.drop(['Weight'], axis=1)",No,5,10.0 "print(train['Label'].value_counts()) rcParams['figure.figsize'] = 10,5 sb.barplot(x = train['Label'].value_counts().index, y = train['Label'].value_counts().values) plt.title('Label counts') plt.show()",No,5,33.0 "# getting dummy variables column enc = LabelEncoder() train['Label'] = enc.fit_transform(train['Label']) train.head()",No,4,20.0 "y = train[""Label""] X = train X_test = test",No,5,21.0 "X.set_index(['EventId'],inplace = True) X_test.set_index(['EventId'],inplace = True) X = X.drop(['Label'], axis=1) X.head()",No,4,21.0 X_test.head(),No,5,41.0 "#Normalizing from sklearn.preprocessing import normalize X = normalize(X) X_test = normalize(X_test)",No,5,18.0 "b""# print(X.isnull().sum(),'\\n')\n# print(X_test.isnull().sum())""",No,5,53.0 "b""#print(X.isnull().sum(),'\\n')\n#print(X_test.isnull().sum())""",No,5,53.0 "import pandas as pd from sklearn import ensemble",No,5,22.0 "# The competition datafiles are in the directory ../input file_train = ""../input/train.csv"" file_test = ""../input/test.csv"" df_train = pd.read_csv(file_train) df_test = pd.read_csv(file_test) df_train.head()",Yes,3,45.0 "feature_cols = [col for col in df_train.columns if col not in ['Cover_Type','Id']] X_train = df_train[feature_cols] X_test = df_test[feature_cols] y = df_train['Cover_Type'] # target test_ids = df_test['Id'] # for submission",No,5,21.0 "clf = ensemble.RandomForestClassifier(n_estimators=200,n_jobs=-1,random_state=0) clf.fit(X_train, y)",Yes,3,4.0 "file_submission = ""rf200.submission.csv"" with open(file_submission, ""w"") as outfile: outfile.write(""Id,Cover_Type\ "") for e, val in enumerate(list(clf.predict(X_test))): outfile.write(""%s,%s\ ""%(test_ids[e],val))'",No,5,25.0 "import time import pandas as pd from sklearn.cross_validation import train_test_split from sklearn import ensemble from sklearn.metrics import accuracy_score import numpy as np loc_test = ""../input/test.csv"" loc_train = ""../input/train.csv"" loc_submission = ""forest-cover-type-prediction.AspiringGuru.csv"" df_test = pd.read_csv(loc_test) df_train = pd.read_csv(loc_train) print (""type(df_test)="", type(df_test), ""df_test.shape="", df_test.shape) print (""type(df_train)="", type(df_train), ""df_train.shape="", df_train.shape) #build list of all columns except the ones we don't want. 
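# (editor's aside, sketch only - not in the original script: an equivalent way to build the
#  feature frame from the df_train loaded above is pandas drop, which avoids listing every
#  kept column by hand)
# X_train_alt = df_train.drop(columns=['Cover_Type', 'Id'])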
# ('Cover_Type' is the predicted value), 'Id' is a unique row identifier feature_cols = [col for col in df_train.columns if col not in ['Cover_Type', 'Id']] #create dataframe of the columns desired from the input data for test and train X_train = df_train[feature_cols] X_test = df_test[feature_cols] #create dataframe of the predicted value to use for building classifier train_y = df_train['Cover_Type'] # test_ids = df_test['Id'] #test_y = df_test['Cover_Type'] del df_train del df_test print (""creating classifier"") start_time = time.time() clf = ensemble.RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=0) print(""--- time to build ensemble.RandomForestClassifier %s seconds ---"" % (time.time() - start_time)) #http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html #n_jobs=-1 : the number of jobs is set to the number of cores.(runs faster) #n_estimators = The number of trees in the forest. print (""fitting from train data"") start_time = time.time() clf.fit(X_train, train_y) print(""--- time to clf.fit %s seconds ---"" % (time.time() - start_time)) print (""predicting from train data"") start_time = time.time() train_y_predicted = clf.predict(X_train) print(""--- time to clf.predict %s seconds ---"" % (time.time() - start_time)) print (""type(train_y_predicted)="", type(train_y_predicted), ""len(train_y_predicted)"", len(train_y_predicted), ""train_y_predicted.shape"", train_y_predicted.shape) print (""type(train_y)="", type(train_y), ""train_y.shape="", train_y.shape) print (""train_y = "", list(train_y[0:20, ]) ) print (""train_y_predicted = "", list(train_y_predicted[0:20, ]) ) print (""predicting from test data"") start_time = time.time() predicted = clf.predict(X_test) print (""--- time to clf.predict %s seconds ---"" % (time.time() - start_time)) print (""type(predicted)="", type(predicted), ""len(predicted)"", len(predicted) ) print (""type(test_ids)="", type(test_ids), ""len(test_ids)="", len(test_ids) ) print (""calculating accuracy_score on train data."") start_time = time.time() score = accuracy_score(train_y, train_y_predicted) print (""--- time to calcualte accuracy_score %s seconds ---"" % (time.time() - start_time)) print (""type(score)="", type(score), ""score="", score ) #for i in range(len(predicted)): print (""clf.predicting & writing to file"") start_time = time.time() with open(loc_submission, ""w"") as outfile: outfile.write(""Id,Cover_Type\ "") for e, val in enumerate(list(clf.predict(X_test))): outfile.write(""%s,%s\ "" % (test_ids[e], val)) print(""--- time to clf.predict & write to file %s seconds ---"" % (time.time() - start_time)) '",Yes,1,22.0 "df=pd.read_csv(""../input/train.csv"") test=pd.read_csv(""../input/test.csv"") y=df[""Cover_Type""] x=df.iloc[:,:-1] id=df.iloc[:,:1]",Yes,2,22.0 "from sklearn.tree import DecisionTreeClassifier reg=DecisionTreeClassifier() reg.fit(x,y)",Yes,2,4.0 pred=reg.predict(test),No,5,48.0 "mysubmission=pd.DataFrame({'Id':test.Id,'Cover_Type':pred})",No,5,55.0 "mysubmission.to_csv(""submission.csv"",index=False)",No,5,25.0 "temp = pd.read_csv(""submission.csv"") temp",No,5,45.0 "from IPython.display import display import matplotlib.pyplot as plt import seaborn as sns",No,5,22.0 "train_set = pd.read_csv('../input/train.csv') test_set = pd.read_csv('../input/test.csv')",No,5,45.0 "display(train_set.head()) display(train_set.describe())",No,4,40.0 "display(train_set.keys()) display(len(train_set.keys()))",No,5,40.0 "# How about using this features directly? 
(Not using the scaling and normalization) fig = plt.figure() fig.set_size_inches(35, 35) sns.set(font_scale=2) # Delete 'Id' and change cover type to dummy variables cont_var_train_set = train_set.drop('Id', axis=1).drop(cate_vars, axis=1) # Categorical feature : cannot using correlation directly. cont_var_train_set_dum = pd.get_dummies(cont_var_train_set, columns=['Cover_Type']) correlation = cont_var_train_set_dum.corr() sns.heatmap(correlation, cmap='viridis', annot=True, linewidths=3)",Yes,1,80.0 from sklearn.preprocessing import StandardScaler,No,5,22.0 "# using scaler scaler = StandardScaler() scaler.fit(scaled_feat) scaled_feat = scaler.transform(scaled_feat) scaled_feat = pd.DataFrame(scaled_feat, columns=cont_vars) scaled_feat.head()",Yes,1,4.0 "fig = plt.figure() fig.set_size_inches(35, 35) correlation2 = pd.concat([scaled_feat, dummy_labels], axis=1).corr() sns.heatmap(correlation2, cmap='viridis', annot=True, linewidths=3)",Yes,2,11.0 "from sklearn.ensemble import RandomForestClassifier from sklearn.model_selection import train_test_split from sklearn.metrics import accuracy_score, classification_report",No,5,22.0 "# Spliting the datasets features = pd.concat([scaled_feat, train_set[cate_vars]], axis=1) features.head()",Yes,2,11.0 "rf_model = RandomForestClassifier(max_depth=7, n_estimators=300) rf_model.fit(x_train, y_train)",Yes,2,4.0 "# Predicting naively pred = rf_model.predict(x_test) display(accuracy_score(y_test, pred)) display(classification_report(y_test, pred))",Yes,3,48.0 "# See the importance of features importances = rf_model.feature_importances_ indices = np.argsort(importances) fig = plt.figure() fig.set_size_inches(20, 20) sns.set(font_scale=1.5) plt.title('Feature Importances') plt.barh(range(len(indices)), importances[indices], color='b', align='center') plt.yticks(range(len(indices)), features.keys()[indices]) plt.xlabel('Relative Importance')",Yes,3,79.0 "# dimensional reduction from sklearn.decomposition import PCA import numpy as np pca = PCA(n_components=None, random_state=20180425) pca.fit(features)",Yes,2,22.0 "pca_var = pca.explained_variance_ratio_ fig, ax = plt.subplots(1, 2, figsize=(16, 8)) ax1, ax2 = ax.flatten() ax1.plot(pca_var) ax2.plot(np.cumsum(pca_var))",No,5,33.0 train_set.head(),No,5,41.0 "wilderness_area_col = train_set['Wilderness_Area'].astype(int) soil_type_col = train_set['Soil_Type'].astype(int) display(wilderness_area_col.head()) display(soil_type_col.head())",Yes,3,16.0 import scipy.stats as ss,No,5,22.0 "cate_vars_1 = ['Wilderness_Area', 'Soil_Type']",No,5,77.0 "input_features = pd.concat([scaled_feat, wilderness_area_col, soil_type_col], axis=1) labels = train_set['Cover_Type'] display(input_features.head()) display(labels.head())",Yes,1,11.0 "x_train, x_test, y_train, y_test = train_test_split(input_features, labels, random_state=20190501, test_size=0.3)",No,5,13.0 test_set_rf = test_set.copy(),No,5,77.0 "test_set_rf_cont = test_set_rf[cont_vars] scaler.fit(test_set_rf_cont) test_set_rf_cont = scaler.transform(test_set_rf_cont) test_set_rf_cont = pd.DataFrame(test_set_rf_cont, columns=cont_vars) test_set_rf_cate = test_set_rf[cate_vars] scaled_test_set_rf = pd.concat([test_set_rf_cont, test_set_rf_cate], axis=1) scaled_test_set_rf.head()",Yes,2,12.0 "rf_pred = rf_model.predict(scaled_test_set_rf) rf_result = pd.concat([test_set['Id'], pd.DataFrame({'Cover_Type': rf_pred})], axis=1) rf_result.to_csv(""rf_submission.csv"", index=False)'",Yes,3,48.0 "# 1. 
scaling the continuous features test_cont_feat = test_set_copy[cont_vars] scaler.fit(test_cont_feat) test_scaled_cont_feat = scaler.transform(test_cont_feat) test_scaled_cont_feat = pd.DataFrame(test_scaled_cont_feat, columns=cont_vars) # 2. categorical features test_cate_feat = test_set_copy[cate_vars_1].astype(int) # 3. concat test_input_features = pd.concat([test_scaled_cont_feat, test_cate_feat], axis=1)",Yes,1,12.0 "display(test_cont_feat.head()) display(test_scaled_cont_feat.head()) display(test_input_features.head())",No,5,41.0 "result = pd.concat([test_set['Id'], pd.DataFrame({'Cover_Type': result})], axis=1)",Yes,4,11.0 result.head(),No,5,41.0 "result.to_csv(""submission.csv"", index=False)",No,5,25.0 "# -*- coding: utf-8 -*- """""" Data Science and Visualization with Python Example: Forest Cover Type URL (problem): https://www.kaggle.com/c/forest-cover-type-prediction URL (solution): https://www.kaggle.com/ivarvb/forest-cover-type Author: Ivar Vargas Belizario E-mail: ivar@usp.br """""" import numpy as np import pandas as pd from sklearn.ensemble import RandomForestRegressor from sklearn.model_selection import train_test_split from sklearn.model_selection import StratifiedKFold from sklearn.model_selection import KFold from sklearn.ensemble import RandomForestClassifier from sklearn.ensemble import ExtraTreesClassifier from sklearn.metrics import accuracy_score from sklearn.manifold import TSNE from matplotlib import pyplot as plt """""" =================================================== I. Data science =================================================== =================================================== 1. Read the training and test data =================================================== """""" train = pd.read_csv('../input/train.csv') test = pd.read_csv('../input/test.csv') """""" =================================================== 2. Preprocessing =================================================== =================================================== 2.1 Cleaning and sampling =================================================== """""" train = train.fillna(0) test = test.fillna(0) # define the class label (target) and identifier (id) columns column_target = 'Cover_Type' column_id = 'Id' # drop the attributes that contain null values data = train.dropna(axis='columns') # number of instances before sampling print (""Total data: "",len(data)) # separate the attributes: the instance identifier (id), # the data attributes (X) and the attribute that contains the class label (y) X = data y = data[column_target] # percentage used for sampling c_sample = 0.99 # sampling if c_sample < 1.0: X_null, X, y_null, y = train_test_split(X, y, test_size=c_sample, random_state=0) ID = X[column_id] y = X[column_target] X = X.drop([column_id, column_target], axis=1).select_dtypes(include=[np.number]) train_select_atributes = X.columns print (""Sample size: "",len(X)) """""" =================================================== 2. 
Processing =================================================== =================================================== 2.1 Dimensionality reduction (feature selection) =================================================== """""" """""" model = ExtraTreesClassifier() model.fit(X, y) imp = model.feature_importances_ names = [] for i in range(len(imp)): r = [] r.append(i) r.append(imp[i]) names.append(r) names = sorted(names, key=lambda x: x[1], reverse=True) fenames = [] columns = list(set(train_select_atributes)) for i in range(len(names)): fenames.append(columns[names[i][0]]) train_select_atributes = fenames[:30] X = X[train_select_atributes].values y = y.values """""" # convert to arrays X = X.values y = y.values """""" =================================================== 3. Learning model (supervised learning): =================================================== =================================================== 3.1. Training: =================================================== """""" # define the model for classification model = RandomForestClassifier(random_state=0, n_estimators=500) # train the model with k-fold (10-fold) cross-validation kf = StratifiedKFold(n_splits=10) outcomes = [] # for each fold for train_index, test_index in kf.split(X, y): Xtrain, Xtest = X[train_index], X[test_index] ytrain, ytest = y[train_index], y[test_index] model.fit(Xtrain, ytrain) expected = ytest predictions = model.predict(Xtest) accuracy = accuracy_score(ytest, predictions) outcomes.append(accuracy) # print the mean accuracy obtained during training mean_outcome = np.array(outcomes).mean() print (""Mean Accuracy:"", mean_outcome) """""" =================================================== 3.2. Testing: =================================================== """""" # select the same attributes as used for the training set X_test = test[train_select_atributes] x_test_id = test[column_id] predictions = model.predict(X_test) predictions = pd.DataFrame(predictions, columns = [""Cover_Type""]) # save the results obtained on the test dataset result = pd.concat([x_test_id, predictions], axis=1, sort=False) result.to_csv(""result.csv"", mode = 'w', index=False) """""" =================================================== II. Visualization of the dataset (projections) =================================================== """""" #print (y) isfineClass = False for i in range(len(y)): if y[i]==0: isfineClass=True break; if isfineClass==False: for i in range(len(y)): v = y[i] y[i] = v-1 # sampling for the visualization c_sample = 0.1 if c_sample < 1.0: X_null, X, y_null, y = train_test_split(X, y, test_size=c_sample, random_state=0) print (""Sample size for the visualization: "", len(X)) # visualization with t-SNE projections tsne = TSNE(n_components=2, random_state=0) X_2d = tsne.fit_transform(X) plt.figure(figsize=(6, 5)) colors = [""#1f77b4"", ""#ff7f0e"", ""#2ca02c"", ""#d62728"", ""#9467bd"", ""#8c564b"", ""#e377c2"", ""#7f7f7f"", ""#bcbd22"", ""#17becf""] for i in range(len(y)): v = y[i] plt.plot(X_2d[i, 0], X_2d[i, 1], 'o', color=colors[v], alpha=0.3) # show the projection plt.show() '",No,2,22.0 "import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv) import matplotlib.pyplot as plt import seaborn as sns",Yes,4,22.0 "#data imports data_train = pd.read_csv(""../input/train.csv"") data_test = pd.read_csv(""../input/test.csv"") data_train.head()",Yes,4,45.0 "keras.backend.clear_session() model = keras.models.Sequential() model.add(keras.layers.Conv2D(64, (3, 3), activation='relu', input_shape=(32, 32, 3))) model.add(keras.layers.BatchNormalization()) model.add(keras.layers.Conv2D(64, (2, 2), activation='relu',padding='same')) model.add(keras.layers.MaxPooling2D(1, 1)) model.add(keras.layers.Dropout(0.1)) model.add(keras.layers.BatchNormalization()) model.add(keras.layers.Conv2D(64, (3, 3), activation='relu',padding='same')) model.add(keras.layers.MaxPooling2D(2, 2)) model.add(keras.layers.Dropout(0.2)) model.add(keras.layers.BatchNormalization()) model.add(keras.layers.Conv2D(64, (2, 2), activation='relu',padding='same')) model.add(keras.layers.MaxPooling2D(1, 1)) model.add(keras.layers.Dropout(0.1)) model.add(keras.layers.Flatten()) model.add(keras.layers.Dense(64, activation='relu')) model.add(keras.layers.BatchNormalization()) model.add(keras.layers.Dropout(0.2)) model.add(keras.layers.Dense(10, activation=""softmax"")) model.compile(loss=keras.losses.CategoricalCrossentropy(from_logits=True), optimizer=keras.optimizers.RMSprop(lr=0.001, decay = 1e-3, momentum = 0.3), metrics=['accuracy']) model.input '",No,5,4.0 " history = model.fit(train_generator, steps_per_epoch=(train_size//batch_size), epochs= 5, validation_data=validation_generator, validation_steps=(validation_size//batch_size) )",No,5,7.0 test_labels.head(2),No,5,41.0 "if os.path.exists(""./test""): shutil.rmtree(""./test"") if os.path.exists(""./train""): shutil.rmtree(""./train"") if not os.path.exists(""./data""): os.mkdir(""./data"") shutil.unpack_archive('/kaggle/input/cifar-10/test.7z', '/kaggle/working/data')'",Yes,2,88.0 "test_dir = os.listdir(""./data/test""); test_dir_len = len(test_dir) print('min:\\t',min(test_dir)) print('max:\\t',max(test_dir)) print("".\\\\test:\\t"",test_dir_len) print(""files:\\t\\t"",test_dir[:3])'",No,3,88.0 "test_data_generator = ImageDataGenerator(rescale=1./255.) 
test_generator = test_data_generator.flow_from_directory(directory='/kaggle/working/data', batch_size=batch_size, shuffle=False,color_mode='rgb', target_size=(32,32), class_mode=None)",No,5,84.0 predict_test = model.predict_generator(test_generator),No,5,48.0 "predict_generator = np.argmax(predict_test, axis=1) print(class_names) predict_generator[:2],[class_names[int(i)] for i in predict_generator[:2]]",No,4,14.0 "submission = pd.DataFrame(columns = ['id','label'],dtype=str) submission[""label""] = [class_names[int(i)] for i in predict_generator] submission[""id""] = [ (''.join(filter(str.isdigit, name ))) for name in test_generator.filenames] submission.head(101)'",Yes,4,12.0 submission.values[50:100],No,5,41.0 "index = 0 fig = plt.figure(figsize = (16,10)) for item in submission.values[50:70]: index += 1 plt.subplot(5, 5, index) test_path = '/kaggle/working/data/test/'+item[0]+'.png' test_image = load_img(test_path, target_size=(32,32)) plt.imshow(test_image) plt.colorbar() plt.grid(False) plt.axis(""off"") plt.title(item[1]) plt.show()'",No,5,56.0 " shutil.rmtree(""./data"")",No,5,84.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load in import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) # Input data files are available in the ""../input/"" directory. # For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory import matplotlib.pyplot as plt import seaborn as sns import os from sklearn.model_selection import train_test_split from sklearn.linear_model import LogisticRegression,SGDClassifier from sklearn.tree import ExtraTreeClassifier from sklearn.svm import SVC from statistics import variance from sklearn.feature_selection import VarianceThreshold print(os.listdir(""../input"")) # Any results you write to the current directory are saved as output.",Yes,5,88.0 "test = pd.read_csv(""../input/test.csv"") train = pd.read_csv(""../input/train.csv"")",No,5,45.0 "train.sample() ",No,5,41.0 "#check for missing values train.info()",No,5,40.0 "sns.heatmap(train.isnull(),cbar = False)",No,5,80.0 "distance=pd.DataFrame(train,columns = ['Horizontal_Distance_To_Hydrology','Horizontal_Distance_To_Roadways', 'Hillshade_Noon','Horizontal_Distance_To_Fire_Points']) for column in distance: plt.figure() distance.boxplot([column])",Yes,4,12.0 "#Cover type is the target to be predicted. 
#Train test split x_train,x_test,y_train,y_test= train_test_split(train.drop('Cover_Type',axis = 1),train['Cover_Type'],test_size = 0.3,random_state = 17)",No,5,13.0 "#Building logistic regression model logreg = LogisticRegression() logreg.fit(x_train,y_train)",Yes,5,7.0 "#Predicting logistic regression results logreg.predict(x_test)",No,5,48.0 "#Logistic regression test scores score = logreg.score(x_test, y_test) print(score)",No,5,49.0 "#Random Forest from sklearn.ensemble import RandomForestClassifier from sklearn.tree import DecisionTreeClassifier",No,5,22.0 "tree_model = DecisionTreeClassifier() ensemble_model = RandomForestClassifier() ",No,5,4.0 "tree_model.fit(x_train,y_train)",No,5,7.0 "ensemble_model.fit(x_train,y_train)",No,5,7.0 "tree_predict=tree_model.predict(x_test) tree_model.score(x_test,y_test)",Yes,4,48.0 "ensemble_predict= ensemble_model.predict(test) print (ensemble_predict) ensemble_model.score(x_test,y_test)",Yes,4,48.0 submission.shape,No,5,58.0 "x_test.shape test.shape ",No,5,58.0 "#current public score is 0.66,this should be improved #checking the variance of each feature train1 = train test1 = test sel = VarianceThreshold(threshold=(.8 * (1 - .8))) sel.fit_transform(train1) train1.head(40) #this is based on this article ,https://scikit-learn.org/stable/modules/feature_selection.html #could see no rows being removed in the data set,as all of them have valid values,non null. ",Yes,3,86.0 tree_model.fit,No,5,7.0 "tree_model.predict(test1) tree_model.score(x_test,y_test)",Yes,4,48.0 "pd.DataFrame([train.mean(), train.std(), train.var()], index=['Mean', 'Std. dev', 'Variance']) ",No,3,12.0 "x=pd.DataFrame(ensemble_model.feature_importances_, index=x_train.columns, columns=['Importance']).sort_values( by='Importance', ascending=False)[:10] print(x)",No,4,79.0 "#Modelling based on important features alone train2 = train test2 = test train_imp = train2[['Id','Elevation','Horizontal_Distance_To_Roadways','Horizontal_Distance_To_Fire_Points', 'Horizontal_Distance_To_Hydrology','Vertical_Distance_To_Hydrology','Hillshade_9am', 'Aspect','Hillshade_3pm', 'Wilderness_Area4','Cover_Type']] ",No,5,12.0 "x_train_imp,x_test_imp,y_train_imp,y_test_imp= train_test_split(train_imp.drop('Cover_Type',axis = 1),train_imp['Cover_Type'], test_size = 0.3,random_state = 17) ",No,5,13.0 "logreg1 = LogisticRegression() logreg1.fit(x_train_imp,y_train_imp) logreg1.predict(x_test_imp) logreg1.score(x_test_imp,y_test_imp)",Yes,3,7.0 "tree_model1 =DecisionTreeClassifier() tree_model1.fit(x_train_imp,y_train_imp) tree_predict=tree_model1.predict(x_test_imp) tree_model1.score(x_test_imp,y_test_imp)",Yes,3,7.0 "ensemble_1 = RandomForestClassifier() ensemble_1.fit(x_train_imp,y_train_imp) ensemble_predict= ensemble_1.predict(x_test_imp) print (ensemble_predict) ensemble_1.score(x_test_imp,y_test_imp)",Yes,3,7.0 "pd.DataFrame(tree_model.feature_importances_,index = x_train.columns,columns=['Importance']).sort_values( by = 'Importance',ascending = False)[:10]",No,4,79.0 "#Modelling based on important features alone train = train.drop([""Soil_Type7"",""Soil_Type15"",""Wilderness_Area1"",""Wilderness_Area2"",""Wilderness_Area3"",""Slope"", ""Hillshade_Noon""],axis = 1) test = test.drop([""Soil_Type7"",""Soil_Type15"",""Wilderness_Area1"",""Wilderness_Area2"",""Wilderness_Area3"",""Slope"", ""Hillshade_Noon""],axis = 1) train3 = train test3 = test train[:10] #train_imp = train3[train] #[['Id','Elevation','Horizontal_Distance_To_Roadways','Horizontal_Distance_To_Fire_Points', # 
'Horizontal_Distance_To_Hydrology','Vertical_Distance_To_Hydrology','Hillshade_9am', # 'Aspect','Hillshade_3pm', # 'Wilderness_Area4','Cover_Type']]'",No,4,10.0 "x_train_3,x_test_3,y_train_3,y_test_3= train_test_split(train3.drop('Cover_Type',axis = 1),train3['Cover_Type'], test_size = 0.3,random_state = 17) ",No,4,13.0 "logreg2 = LogisticRegression() logreg2.fit(x_train_3,y_train_3) logreg2.predict(x_test_3) logreg2.score(x_test_3,y_test_3)",Yes,4,7.0 "tree_model2 =DecisionTreeClassifier() tree_model2.fit(x_train_3,y_train_3) tree_predict2=tree_model2.predict(x_test_3) tree_model2.score(x_test_3,y_test_3) tree_test_pred = tree_model2.predict(test)",Yes,3,7.0 "ensemble_2 = RandomForestClassifier() ensemble_2.fit(x_train_3,y_train_3) ensemble_predict2= ensemble_2.predict(x_test_3) print (ensemble_predict2) ensemble_2.score(x_test_3,y_test_3) ensemble_test_pred = ensemble_2.predict(test) ",Yes,4,7.0 "from sklearn.naive_bayes import GaussianNB nb = GaussianNB() nb.fit(x_train_3,y_train_3) nb.predict(x_test_3) nb.score(x_test_3,y_test_3)",Yes,3,7.0 "sgd = SGDClassifier(loss = 'modified_huber',shuffle = True,random_state = 171) sgd.fit(x_train_3,y_train_3) sgd.predict(x_train_3) sgd.score(x_test_3,y_test_3)",Yes,3,7.0 "sgd = SGDClassifier(loss = 'log',shuffle = True,random_state = 171) sgd.fit(x_train_3,y_train_3) sgd.predict(x_train_3) sgd.score(x_test_3,y_test_3)",Yes,4,7.0 "sgd = SGDClassifier(shuffle = True,random_state = 171) sgd.fit(x_train_3,y_train_3) sgd.predict(x_train_3) sgd.score(x_test_3,y_test_3)",Yes,3,7.0 "submission = pd.DataFrame({'Id':test.Id,'Cover_Type':ensemble_test_pred}) submission.head() submission.to_csv('submission.csv',index = False)",Yes,5,25.0 "submission_tree = pd.DataFrame({'Id':test.Id,'Cover_Type':tree_test_pred}) submission_tree.head() submission_tree.to_csv('submission2.csv',index = False)",Yes,5,25.0 "#Extra tree classifier is a tree based model for classification problems et = ExtraTreeClassifier() et.fit(x_train_3,y_train_3) et.predict(x_train_3) et.score(x_test_3,y_test_3)",Yes,3,7.0 "from sklearn.semi_supervised import LabelPropagation lb = LabelPropagation() lb.fit(x_train_3,y_train_3) lb.predict(x_train_3) lb.score(x_test_3,y_test_3)",Yes,3,7.0 "from sklearn.neighbors import KNeighborsClassifier knng =KNeighborsClassifier() knng.fit(x_train_3,y_train_3) knng.predict(x_train_3) knng.score(x_test_3,y_test_3)",Yes,3,7.0 "features_soil = ['Soil_Type1', 'Soil_Type2', 'Soil_Type3', 'Soil_Type4', 'Soil_Type5', 'Soil_Type6', 'Soil_Type7', 'Soil_Type8', 'Soil_Type9', 'Soil_Type10', 'Soil_Type11', 'Soil_Type12', 'Soil_Type13', 'Soil_Type14', 'Soil_Type15', 'Soil_Type16', 'Soil_Type17', 'Soil_Type18', 'Soil_Type19', 'Soil_Type20', 'Soil_Type21', 'Soil_Type22', 'Soil_Type23', 'Soil_Type24', 'Soil_Type25', 'Soil_Type26', 'Soil_Type27', 'Soil_Type28', 'Soil_Type29', 'Soil_Type30', 'Soil_Type31', 'Soil_Type32', 'Soil_Type33', 'Soil_Type34', 'Soil_Type35', 'Soil_Type36', 'Soil_Type37', 'Soil_Type38', 'Soil_Type39', 'Soil_Type40'] data_train[""Soil_Count""] = data_train[features_soil].apply(sum, axis=1) data_train.head()'",Yes,4,8.0 data_train.Soil_Count.describe(),No,5,40.0 data_test[features_soil].describe(),No,5,40.0 "data_train[""Soil_Type""] = data_train[features_soil].apply(np.argmax, axis=1) data_train.head()",Yes,4,8.0 "data_train[""Soil_Type""] = data_train[""Soil_Type""].apply(lambda x: x.split(""Soil_Type"")[-1]) data_train.head()",Yes,4,8.0 "features_wilderness = ['Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3','Wilderness_Area4'] 
data_train[""Wilderness_Area""] = data_train[features_wilderness].apply(sum, axis=1) data_train.Wilderness_Area.describe()'",Yes,4,8.0 "data_train[""Wilderness_Area""] = data_train[features_wilderness].apply(np.argmax, axis=1) data_train[""Wilderness_Area""] = data_train[""Wilderness_Area""].apply(lambda x: x.split(""Wilderness_Area"")[-1]) data_train.Wilderness_Area.head()",Yes,4,8.0 "sns.countplot(data_train.Cover_Type) plt.show()",No,5,33.0 data_train.columns,No,5,71.0 "features = ['Elevation', 'Aspect', 'Slope','Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways', 'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm', 'Horizontal_Distance_To_Fire_Points', ""Cover_Type""] sns.heatmap(data=data_train[features].corr(), annot=True, linecolor=""w"", fmt="".1"") plt.show()'",No,5,80.0 "#Import pandas, tensorflow e keras import pandas as pd import numpy import matplotlib.pyplot as plt from sklearn.model_selection import train_test_split from sklearn import preprocessing import tensorflow as tf from tensorflow.python.data import Dataset import keras from keras import regularizers from keras.utils import to_categorical from keras import models from keras import layers from keras import backend as K import os #for dirname, _, filenames in os.walk('/kaggle/input'): # for filename in filenames: # print(os.path.join(dirname, filename)) #Lettura dati df = pd.read_csv(""/kaggle/input/forest-cover-type-prediction/train.csv"") dfT = pd.read_csv(""/kaggle/input/forest-cover-type-prediction/test.csv"")'",Yes,5,45.0 "data_train = pd.read_csv(""../input/train.csv"") final_train = clear_dataset(data_train)",Yes,4,45.0 "#Selezioniamo le caratteristiche x = df[df.columns[1:55]] xT = dfT[dfT.columns[1:55]] #Selezioniamo le etichette (8) y = df.Cover_Type #Split data into train and test x_train, x_test, y_train, y_test = train_test_split(x, y , train_size = 0.7, random_state = 90)",Yes,4,13.0 "x_data = final_train.drop([""Cover_Type"", ""Id""], axis=1) y_data = final_train[""Cover_Type""]",Yes,4,10.0 "# Normalize Training Data scaler = preprocessing.StandardScaler() scaler.fit(x_train.values[:,0:10]) x_train_norm = scaler.transform(x_train.values[:,0:10]) x_test_norm = scaler.transform(x_test.values[:,0:10]) x_sub = scaler.transform(xT.values[:,0:10]) x_train_norm=numpy.concatenate((x_train_norm,x_train.values[:,10:]),axis=1) x_test_norm=numpy.concatenate((x_test_norm,x_test.values[:,10:]),axis=1) x_sub=numpy.concatenate((x_sub,xT.values[:,10:]),axis=1)",Yes,4,18.0 "modelF = models.Sequential() modelF.add(layers.Dense(32,name=""Layer_1"",activation='relu',input_dim=54,kernel_initializer='he_normal',kernel_regularizer=regularizers.l1_l2(l1=0.01, l2=0.08))) modelF.add(layers.BatchNormalization()) modelF.add(layers.Dense(16,name=""Layer_2"",activation='relu')) modelF.add(layers.Dense(64,name=""Layer_22"",activation='relu')) modelF.add(layers.BatchNormalization()) modelF.add(layers.Dense(64,name=""Layer_23"",activation='relu')) modelF.add(layers.BatchNormalization()) modelF.add(layers.Dense(16,name=""Layer_4"",activation='relu')) modelF.add(layers.Dense(8,name=""Layer_5"",activation='softmax')) modelF.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy']) modelF.summary()'",No,5,4.0 "Net4 = modelF.fit( x_train_norm, y_train, epochs= 400, batch_size = 256, validation_data = (x_test_norm, y_test))",No,5,7.0 "_, train_acc = modelF.evaluate(x_train_norm, y_train, verbose=0) _, test_acc = modelF.evaluate(x_test_norm, y_test, verbose=0) 
print('Train: %.3f, Test: %.3f' % (train_acc, test_acc)) # plot loss during training plt.rcParams['figure.figsize'] = (12.0, 9.0) plt.subplot(211) plt.title('Loss') plt.plot(Net4.history['loss'], label='train') plt.plot(Net4.history['val_loss'], label='test') plt.legend() # plot accuracy during training plt.subplot(212) plt.title('Accuracy') plt.plot(Net4.history['acc'], label='train') plt.plot(Net4.history['val_acc'], label='test') plt.legend() plt.show()",Yes,5,35.0 "test_predictions=modelF.predict_classes(x_sub, batch_size=256, verbose=0)",No,5,48.0 "solutions = pd.DataFrame({'Id':dfT.Id, 'Cover_Type':test_predictions}) solutions.to_csv('submission.csv',index=False)",Yes,5,25.0 "#This Python 3 environment comes with many helpful analytics libraries installed #It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python #For example, here's several helpful packages to load in import numpy as np # linear algebra import pandas as pd import matplotlib.pyplot as plt import seaborn as sns import warnings warnings.filterwarnings('ignore') %matplotlib inline #Input data files are available in the ""../input/"" directory. #For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory import os for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename)) #Any results you write to the current directory are saved as output.",Yes,4,88.0 "df_train = pd.read_csv('../input/forest-cover-type-prediction/train.csv') df_test = pd.read_csv('../input/forest-cover-type-prediction/test.csv')",No,5,45.0 df_train.dtypes,No,5,70.0 "pd.set_option('display.max_columns', None) df_train.describe()",No,4,40.0 "df_train = df_train.drop(['Soil_Type7', 'Soil_Type15'], axis = 1) df_test = df_test.drop(['Soil_Type7', 'Soil_Type15'], axis = 1) ",No,5,10.0 "df_train = df_train.iloc[:,1:] df_test = df_test.iloc[:,1:]",No,4,13.0 "size = 10 corrmat = df_train.iloc[:, :size].corr() f, ax = plt.subplots(figsize = (10,8)) sns.heatmap(corrmat, vmax = 0.8, square = True)",No,5,80.0 "data = df_train.iloc[:, :size] cols = data.columns #Running pearson coefficient for all combinations data_corr = data.corr() threshold = 0.5 corr_list = []",Yes,4,40.0 data_corr,No,4,40.0 "#sorting the highly correlated values for i in range(0, size): for j in range(i+1, size): if data_corr.iloc[i, j] >= threshold and data_corr.iloc[i, j]<1\\ or data_corr.iloc[i,j] < 0 and data_corr.iloc[i,j]<=-threshold: corr_list.append([data_corr.iloc[i,j],i,j])'",No,3,9.0 "#Sorting values s_corr_list = sorted(corr_list, key = lambda x: -abs(x[0])) #print the higher values for v, i, j in s_corr_list: print(""%s and %s = %.2f"" % (cols[i], cols[j], v))",Yes,5,9.0 "df_train.iloc[:, :10].skew()",No,4,40.0 "from sklearn.model_selection import train_test_split x_train, x_val, y_train, y_val = train_test_split(x_data, y_data, test_size=0.25, random_state=42)",Yes,4,13.0 "for v, i, j in s_corr_list: sns.pairplot(data = df_train, hue = 'Cover_Type', size = 6, x_vars = cols[i], y_vars = cols[j]) plt.show()",No,5,33.0 "# A violin plot is a hybrid of a box plot and a kernel density plot, which shows peaks in the data. 
cols = df_train.columns size = len(cols) - 1 # We don't need the target attribute # x-axis has target attributes to distinguish between classes x = cols[size] y = cols[0:size] for i in range(0, size): sns.violinplot(data=df_train, x=x, y=y[i]) plt.show()",No,5,33.0 "from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report def get_metrics(y_test, y_predicted): # true positives / (true positives+false positives) precision = precision_score(y_test, y_predicted, pos_label=None, average='weighted') # true positives / (true positives + false negatives) recall = recall_score(y_test, y_predicted, pos_label=None, average='weighted') # harmonic mean of precision and recall f1 = f1_score(y_test, y_predicted, pos_label=None, average='weighted') # true positives + true negatives/ total accuracy = accuracy_score(y_test, y_predicted) return accuracy, precision, recall, f1",Yes,4,49.0 df_train.Wilderness_Area2.value_counts(),No,5,72.0 "from sklearn.ensemble import RandomForestClassifier clf = RandomForestClassifier(n_estimators=100, max_depth=19, max_features=11,n_jobs=-1, random_state=42) clf.fit(x_train, y_train) y_predicted = clf.predict(x_val)",Yes,2,4.0 "### Group one-hot encoded variables of a category into one single variable cols = df_train.columns r,c = df_train.shape # Create a new dataframe with r rows, one column for each encoded category, and target in the end new_data = pd.DataFrame(index= np.arange(0,r), columns=['Wilderness_Area', 'Soil_Type', 'Cover_Type']) # Make an entry in data for each r for category_id, target_value for i in range(0,r): p = 0; q = 0; # Category1_range for j in range(10,14): if (df_train.iloc[i,j] == 1): p = j-9 # category_class break # Category2_range for k in range(14,54): if (df_train.iloc[i,k] == 1): q = k-13 # category_class break # Make an entry in data for each r new_data.iloc[i] = [p,q,df_train.iloc[i, c-1]] # plot for category1 sns.countplot(x = 'Wilderness_Area', hue = 'Cover_Type', data = new_data) plt.show() # Plot for category2 plt.rc(""figure"", figsize = (25,10)) sns.countplot(x='Soil_Type', hue = 'Cover_Type', data= new_data) plt.show()'",Yes,3,33.0 "from xgboost import XGBClassifier clf = XGBClassifier(n_estimators=200, learning_rate=0.3, max_depth=3,n_jobs=-1, seed=42, objective=""multi:softmax"") clf.fit(x_train, y_train) y_predicted = clf.predict(x_val) accuracy, precision, recall, f1 = get_metrics(y_val, y_predicted) print(""accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f"" % (accuracy, precision, recall, f1))",Yes,2,4.0 "# Checking the value count for different soil_types for i in range(10, df_train.shape[1]-1): j = df_train.columns[i] print (df_train[j].value_counts())",No,4,72.0 "from lightgbm import LGBMClassifier clf = LGBMClassifier(n_estimators=200, learning_rate=0.3, max_depth=3,n_jobs=-1, seed=42, objective=""multi:softmax"") clf.fit(x_train, y_train) y_predicted = clf.predict(x_val) accuracy, precision, recall, f1 = get_metrics(y_val, y_predicted) print(""accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f"" % (accuracy, precision, recall, f1))",Yes,2,4.0 "df_train = df_train.drop(['Soil_Type8', 'Soil_Type25'], axis=1) df_test = df_test.drop(['Soil_Type8', 'Soil_Type25'], axis=1) df_train1 = df_train # To be used for algos like SVM where we need normalization and StandardScaler df_test1 = df_test # To be used under normalization and StandardScaler",Yes,5,10.0 "# Checking for data transformation (take only non-categorical values) df_train.iloc[:,:10].skew()",No,5,40.0 
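The random forest, XGBoost and LightGBM cells above are compared on a single train/validation split. Below is a minimal sketch (an editorial addition, not one of the original cells) of cross-validating the same random-forest configuration so the weighted F1 is averaged over folds; it assumes the x_train/y_train split from the cells above, and the 5-fold setup and scorer are illustrative choices.
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, f1_score
from sklearn.ensemble import RandomForestClassifier

# Same hyperparameters as the RandomForestClassifier cell above, scored with weighted F1 over 5 folds.
f1_weighted = make_scorer(f1_score, average='weighted')
rf_cv = RandomForestClassifier(n_estimators=100, max_depth=19, max_features=11, n_jobs=-1, random_state=42)
scores = cross_val_score(rf_cv, x_train, y_train, scoring=f1_weighted, cv=5, n_jobs=-1)
print('weighted F1 per fold:', scores)
print('mean +/- std: %.3f +/- %.3f' % (scores.mean(), scores.std()))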
"from scipy import stats plt.figure(figsize =(8,6)) sns.distplot(df_train1['Horizontal_Distance_To_Hydrology'], fit = stats.norm) fig = plt.figure(figsize=(8,6)) res = stats.probplot(df_train1['Horizontal_Distance_To_Hydrology'], plot=plt)",Yes,5,33.0 df_train1['Horizontal_Distance_To_Hydrology'] = np.sqrt(df_train1['Horizontal_Distance_To_Hydrology']),No,5,8.0 "plt.figure(figsize=(8,6)) sns.distplot(df_train1['Horizontal_Distance_To_Hydrology'], fit = stats.norm) fig = plt.figure(figsize=(8,6)) res = stats.probplot(df_train1['Horizontal_Distance_To_Hydrology'], plot=plt)",Yes,5,33.0 "#Vertical_Distance_To_Hydrology plt.figure(figsize=(8,6)) sns.distplot(df_train1['Vertical_Distance_To_Hydrology'], fit = stats.norm) fig = plt.figure(figsize=(8,6)) res = stats.probplot(df_train1['Vertical_Distance_To_Hydrology'], plot=plt)",Yes,5,33.0 "#Horizontal_Distance_To_Roadways plt.figure(figsize=(8,6)) sns.distplot(df_train1['Horizontal_Distance_To_Roadways'], fit=stats.norm) fig = plt.figure(figsize=(8,6)) res = stats.probplot(df_train1['Horizontal_Distance_To_Roadways'], plot=plt)",Yes,5,33.0 df_train1['Horizontal_Distance_To_Roadways'] = np.sqrt(df_train1['Horizontal_Distance_To_Roadways']),No,5,8.0 "# Plot again after sqrt transformation plt.figure(figsize=(8,6)) sns.distplot(df_train1['Horizontal_Distance_To_Roadways'], fit = stats.norm) fig = plt.figure(figsize=(8,6)) res = stats.probplot(df_train1['Horizontal_Distance_To_Roadways'], plot=plt)",Yes,5,33.0 "plt.figure(figsize=(8, 6)) sns.distplot(df_train1['Hillshade_9am'], fit=stats.norm) plt.figure(figsize=(8, 6)) res = stats.probplot(df_train1['Hillshade_9am'], plot = plt)",Yes,5,33.0 df_train['Hillshade_9am'] = np.square(df_train1['Hillshade_9am']),No,5,8.0 "plt.figure(figsize = (8,6)) sns.distplot(df_train['Hillshade_9am'], fit = stats.norm) fig = plt.figure(figsize = (8,6)) res = stats.probplot(df_train1['Hillshade_9am'], plot = plt)",Yes,5,33.0 "# Hillshade_Noon fig = plt.figure(figsize=(8,6)) sns.distplot(df_train1['Hillshade_Noon'],fit=stats.norm) fig = plt.figure(figsize=(8,6)) res = stats.probplot(df_train1['Hillshade_Noon'],plot=plt)",Yes,5,33.0 df_train1['Hillshade_Noon'] = np.square(df_train1['Hillshade_Noon']),No,5,8.0 "# Plot again after square transformation fig = plt.figure(figsize=(8,6)) sns.distplot(df_train1['Hillshade_Noon'],fit=stats.norm) fig = plt.figure(figsize=(8,6)) res = stats.probplot(df_train1['Hillshade_Noon'],plot=plt) ",Yes,5,33.0 "# Horizontal_Distance_To_Fire_Points plt.figure(figsize=(8,6)) sns.distplot(df_train1['Horizontal_Distance_To_Fire_Points'], fit=stats.norm) plt.figure(figsize=(8,6)) res = stats.probplot(df_train1['Horizontal_Distance_To_Fire_Points'],plot=plt)",Yes,5,33.0 df_train1['Horizontal_Distance_To_Fire_Points'] = np.sqrt(df_train1['Horizontal_Distance_To_Fire_Points']),No,5,8.0 "# Plot again after sqrt transformation plt.figure(figsize=(8,6)) sns.distplot(df_train1['Horizontal_Distance_To_Fire_Points'], fit=stats.norm) plt.figure(figsize=(8,6)) res = stats.probplot(df_train1['Horizontal_Distance_To_Fire_Points'],plot=plt)",Yes,5,33.0 "b""# To be used in case of algorithms like SVM\ndf_test1[['Horizontal_Distance_To_Hydrology','Horizontal_Distance_To_Fire_Points'\\\n ,'Horizontal_Distance_To_Roadways']] = np.sqrt(df_test1[['Horizontal_Distance_To_Hydrology',\\\n 'Horizontal_Distance_To_Fire_Points','Horizontal_Distance_To_Roadways']])""",No,5,8.0 "# To be used in case of algorithms like SVM df_test1[['Hillshade_9am','Hillshade_Noon']] = 
np.square(df_test1[['Hillshade_9am','Hillshade_Noon']])",No,5,8.0 "#non categorical variables only Size = 10 X_train_temp = df_train.iloc[:,:Size] X_test_temp = df_test.iloc[:,:Size] X_train_temp1 = df_train1.iloc[:,:Size] X_test_temp1 = df_test1.iloc[:,:Size] X_train_temp1 = StandardScaler().fit_transform(X_train_temp1) X_test_temp1 = StandardScaler().fit_transform(X_test_temp1)",Yes,4,14.0 "df_train1.iloc[:,:]",No,5,41.0 "r,c = df_train.shape X_train = np.concatenate((X_train_temp,df_train.iloc[:,Size:c-1]),axis=1) X_train1 = np.concatenate((X_train_temp1, df_train1.iloc[:,Size:c-1]), axis=1) # to be used for SVM y_train = df_train.Cover_Type.values",Yes,5,21.0 "from sklearn import svm from sklearn.model_selection import train_test_split #In the new version these are in the model_selection module. Use this: from sklearn.model_selection import learning_curve, GridSearchCV. from sklearn.model_selection import GridSearchCV, RandomizedSearchCV",No,5,22.0 "x_data, x_test_data, y_data, y_test_data = train_test_split(X_train1,y_train,test_size=0.2, random_state=123) svm_para = [{'kernel':['rbf'],'C': [1,10,100,100]}]",Yes,4,13.0 "classifier = GridSearchCV(svm.SVC(),svm_para,cv=3,verbose=2) classifier.fit(x_data,y_data) classifier.best_params_ #classifier.best_score_",Yes,4,7.0 "# Parameters optimized using the code in above cell #C_opt = 10 # reasonable option #clf = svm.SVC(C=C_opt,kernel='rbf') #clf.fit(X_train1,y_train) classifier.fit(X_train1,y_train) classifier.score(X_train1,y_train)",Yes,4,7.0 classifier.best_score_,No,5,1.0 classifier.cv_results_,No,5,2.0 df_Test1 = pd.read_csv('../input/forest-cover-type-prediction/test.csv'),No,5,45.0 "from sklearn.ensemble import ExtraTreesClassifier from sklearn.metrics import classification_report x_data, x_test_data, y_data, y_test_data = train_test_split(X_train,y_train,test_size= 0.3, random_state=0) etc_para = [{'n_estimators': [20, 30, 100], 'max_depth':[5, 10, 15], 'max_features': [0.1, 0.2, 0.3]}] #default number of features is sqrt(n) #default number of min_samples_leaf is 1",Yes,3,13.0 "ETC = GridSearchCV(ExtraTreesClassifier(),param_grid=etc_para, cv=10, n_jobs=-1) ETC.fit(x_data, y_data) ETC.best_params_ ETC.best_score_",No,5,6.0 "b""print ('Best accuracy obtained: {}'.format(ETC.best_score_))\nprint ('Parameters:')\nfor key, value in ETC.best_params_.items():\n print('\\t{}:{}'.format(key,value))""",Yes,3,49.0 "# Classification Report Y_pred = ETC.predict(x_test_data) target = ['class1', 'class2','class3','class4','class5','class6','class7' ] print (classification_report(y_test_data, Y_pred, target_names=target))",Yes,4,48.0 "from sklearn.model_selection import learning_curve from sklearn.model_selection import ShuffleSplit def plot_learning_curve(model,title, X, y,n_jobs = 1, ylim = None, cv = None,train_sizes = np.linspace(0.1, 1, 5)): # Figrue parameters plt.figure(figsize=(10,8)) plt.title(title) if ylim is not None: plt.ylim(*ylim) plt.xlabel('Training Examples') plt.ylabel('Score') train_sizes, train_score, test_score = learning_curve(model, X, y, cv = cv, n_jobs=n_jobs, train_sizes=train_sizes) # Calculate mean and std train_score_mean = np.mean(train_score, axis=1) train_score_std = np.std(train_score, axis=1) test_score_mean = np.mean(test_score, axis=1) test_score_std = np.std(test_score, axis=1) plt.grid() plt.fill_between(train_sizes, train_score_mean - train_score_std, train_score_mean + train_score_std,\\ alpha = 0.1, color = 'r') plt.fill_between(train_sizes, test_score_mean - test_score_std, test_score_mean + 
test_score_std,\\ alpha = 0.1, color = 'g') plt.plot(train_sizes, train_score_mean, 'o-', color=""r"", label=""Training score"") plt.plot(train_sizes, test_score_mean, 'o-', color=""g"", label=""Cross-validation score"") plt.legend(loc = ""best"") return plt'",Yes,5,35.0 "b""# 'max_features': 0.3, 'n_estimators': 100, 'max_depth': 15, 'min_samples_leaf: 1'\netc = ExtraTreesClassifier(bootstrap=True, oob_score=True, n_estimators=100, max_depth=10, max_features=0.3, \\\n min_samples_leaf=1)\n\netc.fit(X_train, y_train)\n# yy_pred = etc.predict(X_test)\netc.score(X_train, y_train)""",Yes,3,4.0 "r,c = df_test.shape X_test = np.concatenate((X_test_temp, df_test.iloc[:,Size:c]), axis = 1) yy_pred = etc.predict(X_test) solution = pd.DataFrame({'Id':df_Test1.Id, 'Cover_Type':yy_pred}, columns = ['Id','Cover_Type']) solution.to_csv('ETCcover_sol.csv', index=False)",Yes,4,48.0 "# Plotting learning curve title = 'Learning Curve (ExtraTreeClassifier)' # cross validation with 50 iterations to have a smoother curve cv = ShuffleSplit(n_splits=50, test_size=0.2, random_state=0) model = etc plot_learning_curve(model,title,X_train, y_train, n_jobs=-1,ylim=None,cv=cv) plt.show()",No,5,35.0 "data_test = pd.read_csv(""../input/test.csv"") final_test = clear_dataset(data_test)",Yes,4,45.0 "y_predicted = clf.predict(x_train) accuracy, precision, recall, f1 = get_metrics(y_train, y_predicted) print(""train accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f"" % (accuracy, precision, recall, f1))",Yes,2,27.0 "import numpy as np import pandas as pd import matplotlib.pyplot as plt from sklearn.preprocessing import OneHotEncoder from sklearn.preprocessing import MinMaxScaler from sklearn.pipeline import Pipeline from sklearn.compose import ColumnTransformer from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV from sklearn.metrics import confusion_matrix, classification_report from sklearn.linear_model import LogisticRegression from sklearn.tree import DecisionTreeClassifier from sklearn.ensemble import RandomForestClassifier import keras from keras.models import Sequential from keras.layers import * import keras.backend as K",No,5,22.0 "test = pd.read_csv(""../input/forest-cover-type-prediction/test.csv"") train = pd.read_csv(""../input/forest-cover-type-prediction/train.csv"")",No,5,45.0 "X_train_full = train.drop(['Id', 'Cover_Type'], axis=1) y_train_full = train.Cover_Type - 1 X_test = test.drop('Id', axis=1) test_id = test.Id print(X_train_full.shape) print(X_test.shape)",Yes,3,10.0 "print(list(zip(range(0,56), X_train_full.columns)))",No,5,71.0 "scaler = MinMaxScaler() Xs_train_full = scaler.fit_transform(X_train_full) Xs_test = scaler.transform(X_test)",Yes,3,4.0 "Xs_train, Xs_valid, y_train, y_valid = train_test_split(Xs_train_full, y_train_full, test_size=0.2, random_state=1, stratify=y_train_full) print(Xs_train.shape) print(Xs_valid.shape)",Yes,2,13.0 "temp = LogisticRegression(max_iter=10000) temp.fit(Xs_train, y_train) temp.score(Xs_train, y_train)",Yes,2,4.0 "np.random.seed(1) model = Sequential() model.add(Dense(512, input_shape=(54,), activation='relu')) model.add(Dense(512, activation='relu')) model.add(Dense(512, activation='relu')) model.add(Dense(7, activation='softmax')) model.summary()",No,5,84.0 "opt = keras.optimizers.Adam(lr=0.001) model.compile(loss='sparse_categorical_crossentropy', optimizer=opt, metrics=['accuracy']) h1 = model.fit(Xs_train, y_train, batch_size=20000, epochs=500, validation_data=(Xs_valid, y_valid), verbose=2)",Yes,2,4.0 
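The Keras model above is trained in stages, with the learning rate lowered by hand through K.set_value in the cells that follow. A minimal sketch (editorial addition, not the notebook's own code) of expressing that staged decay as a single fit call with a callback; the patience, factor and epoch count here are illustrative assumptions.
from keras.callbacks import ReduceLROnPlateau

# Drop the learning rate by 10x when validation loss stops improving,
# mimicking the manual 0.001 -> 0.0001 -> 0.00001 schedule used below.
lr_drop = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=100, min_lr=1e-5, verbose=1)
history = model.fit(Xs_train, y_train, batch_size=20000, epochs=1500,
                    validation_data=(Xs_valid, y_valid),
                    callbacks=[lr_drop], verbose=2)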
"K.set_value(model.optimizer.lr, 0.0001) h2 = model.fit(Xs_train, y_train, batch_size=20000, epochs=500, validation_data=(Xs_valid, y_valid), verbose=2)",Yes,4,7.0 "K.set_value(model.optimizer.lr, 0.00001) h3 = model.fit(Xs_train, y_train, batch_size=20000, epochs=500, validation_data=(Xs_valid, y_valid), verbose=2)",Yes,4,7.0 test_pred = model.predict_classes(Xs_test),No,5,48.0 !pip install seaborn,Yes,5,87.0 "import numpy as np import pandas as pd import matplotlib.pyplot as plt from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV from sklearn.pipeline import Pipeline from sklearn.linear_model import LogisticRegression from sklearn.tree import DecisionTreeClassifier from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier from xgboost import XGBClassifier",No,5,22.0 "submission_sample = pd.read_csv('../input/forest-cover-type-prediction/sampleSubmission.csv') train = pd.read_csv('../input/forest-cover-type-prediction/train.csv') test = pd.read_csv('../input/forest-cover-type-prediction/test.csv')",No,5,45.0 train.sample(5),No,5,41.0 test.sample(5),No,5,41.0 print(list(enumerate(train.columns))),No,5,71.0 train.nunique(),No,5,54.0 "submission = pd.DataFrame({ 'Id':test_id, 'Cover_Type':test_pred }) submission.head()",Yes,4,12.0 "submission.to_csv('my_submission.csv', index=False)",No,5,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) import matplotlib.pyplot as plt # Input data files are available in the read-only ""../input/"" directory # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory import os for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename)) # You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using ""Save & Run All"" # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session'",No,5,88.0 "train_data=pd.read_csv('/kaggle/input/forest-cover-type-prediction/train.csv') train_data.head()",Yes,3,45.0 "test_data=pd.read_csv('/kaggle/input/forest-cover-type-prediction/test.csv') test_data.head()",Yes,3,45.0 train_data.info(),No,5,40.0 test_data.info(),No,5,40.0 from sklearn.model_selection import train_test_split,No,5,22.0 "X=train_data.drop(labels=['Id','Cover_Type'],axis=1) y=train_data['Cover_Type']",Yes,3,10.0 "print(X_train.shape,y_train.shape) print(X_val.shape,y_val.shape)",Yes,5,58.0 "rfc=RandomForestClassifier(n_estimators=70) rfc.fit(X_train,y_train)",Yes,3,4.0 "rfc.score(X_val,y_val)",No,5,49.0 "predict=rfc.predict(test_data.drop(labels=['Id'],axis=1))",Yes,3,48.0 "Submission=pd.DataFrame(data=predict,columns=['Cover_Type']) Submission.head()",Yes,2,12.0 Submission.head(),No,5,41.0 Submission.to_csv('Submission.csv'),No,5,25.0 "import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns import pickle pd.set_option('display.max_columns', 100) pd.options.mode.chained_assignment = None",Yes,4,22.0 "dtrain = pd.read_csv(train_path, index_col=0) dtest = pd.read_csv(test_path, 
index_col=0)",No,5,45.0 dtrain['Cover_Type'].value_counts(),No,5,72.0 dtrain.info(),No,5,40.0 "# Now this includes values for all classes, better to groupyby the target variable and then get description. dtrain.describe()",No,5,40.0 "X_train,X_test,y_train,y_test = train_test_split(X,y,random_state = 10,test_size=0.2,shuffle =True)",No,5,13.0 "logistic_regression= LogisticRegression() logistic_regression.fit(X_train,y_train) y_pred=logistic_regression.predict(X_test)",Yes,4,7.0 "# fit the model on the whole dataset random_forest = RandomForestClassifier() random_forest.fit(X_train, y_train)",No,5,7.0 "decisionTreeModel = DecisionTreeClassifier(criterion= 'entropy', max_depth = None, splitter='best', random_state=10) decisionTreeModel.fit(X_train,y_train)",No,5,7.0 "KNeighborsModel = KNeighborsClassifier(n_neighbors = 7, weights = 'distance', algorithm = 'brute') KNeighborsModel.fit(X_train,y_train)",No,5,7.0 "bernoulliNBModel = BernoulliNB(alpha=0.1) bernoulliNBModel.fit(X_train,y_train)",No,5,7.0 "gaussianNBModel = GaussianNB() gaussianNBModel.fit(X_train,y_train)",No,5,7.0 "XGB_Classifier = XGBClassifier() XGB_Classifier.fit(X_train, y_train)",No,5,7.0 "#evaluation Details models = [logistic_regression, random_forest, decisionTreeModel, KNeighborsModel, bernoulliNBModel, gaussianNBModel, XGB_Classifier] for model in models: print(type(model).__name__,' Train Score is : ' ,model.score(X_train, y_train)) print(type(model).__name__,' Test Score is : ' ,model.score(X_test, y_test)) y_pred = model.predict(X_test) print(type(model).__name__,' F1 Score is : ' ,f1_score(y_test,y_pred)) print('--------------------------------------------------------------------------')",No,3,48.0 y_pred = XGB_Classifier.predict(X_test),No,5,48.0 "import seaborn as sn confusion_matrix = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted']) sn.heatmap(confusion_matrix, annot=True)",No,5,80.0 "b""from sklearn.metrics import accuracy_score,classification_report\n\nprint(accuracy_score(y_test,y_pred).round(4)*100,'\\n')\n\nprint(pd.crosstab(y_test,y_pred),'\\n')\n\nprint(classification_report(y_test,y_pred),'\\n')""",No,5,49.0 X_test.shape,No,5,58.0 test_predict = XGB_Classifier.predict(test_to_pred),No,5,48.0 "test.reset_index(inplace = True) test.head()",No,5,61.0 predict = test['EventId'],No,3,21.0 "predict = pd.concat([predict,test_predict], axis=1)",No,5,11.0 "predict.to_csv(""submission.csv"",index=False)",No,5,25.0 predict.tail(200),No,5,41.0 sb.countplot(predict.Class),No,5,33.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv) import matplotlib.pyplot as plt # Input data files are available in the read-only ""../input/"" directory # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory import os for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename)) # You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using ""Save & Run All"" # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session'",No,5,88.0 "train_data = pd.read_csv('/kaggle/input/forest-cover-type-prediction/train.csv') train_data.head()",Yes,4,45.0 "test_data = pd.read_csv('/kaggle/input/forest-cover-type-prediction/test.csv') test_data.head()",Yes,4,45.0 "import seaborn as sns plt.figure(figsize=(15,10)) sns.countplot(train['Cover_Type']) plt.xlabel(""Type of Cover"", fontsize=12) plt.ylabel(""Rows Count"", fontsize=12) plt.show()'",No,4,81.0 "# Bivariate EDA pd.crosstab(train.Soil_Type31, train.Cover_Type)",No,5,40.0 "#Convert dummy features back to categorical x = train.iloc[:,15:55] y = train.iloc[:,11:15] y = pd.DataFrame(y) x = pd.DataFrame(x) s2 = pd.Series(x.columns[np.where(x!=0)[1]]) s3 = pd.Series(y.columns[np.where(y!=0)[1]]) train['soil_type'] = s2 train['Wilderness_Area'] = s3 train.head()",Yes,4,8.0 "# Create a new dataset excluding dummy variables for Multivariate EDA df_viz = train.iloc[:, 0:15] df_viz = df_viz.drop(['Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4'], axis = 1) df_viz.head()",Yes,3,10.0 train_data['Slope'].plot(kind='hist'),No,5,33.0 test_data['Elevation'].plot(kind='hist'),No,5,33.0 train_data['Cover_Type'].value_counts() # from the results it is visible that no class is oversampled or undersampled,No,5,72.0 from sklearn.model_selection import train_test_split ,No,5,22.0 "X_train, X_val, y_train,y_val = train_test_split(X,y,random_state=40)",No,5,13.0 rfc=RandomForestClassifier(n_estimators=70),No,5,4.0 "rfc.fit(X_train,y_train)",No,5,7.0 "submission = pd.DataFrame(data=predict,columns=['Cover_Type']) submission.head()",No,3,12.0 "submission['Id'] = test_data['Id'] submission.set_index('Id',inplace=True)",No,5,55.0 submission.to_csv('Submission.csv'),No,5,25.0 "import pandas as pd import numpy as np from sklearn.metrics import confusion_matrix from sklearn.metrics import f1_score from sklearn.metrics import accuracy_score import pandas as pd import numpy as np import matplotlib.pyplot as plt from sklearn.linear_model import LinearRegression from sklearn.metrics import mean_squared_error from sklearn.metrics import mean_absolute_error from sklearn.metrics import r2_score from sklearn.preprocessing import PolynomialFeatures from sklearn.tree import DecisionTreeRegressor from sklearn.ensemble import RandomForestClassifier import matplotlib.pyplot as plt from sklearn.model_selection import train_test_split from lightgbm import LGBMClassifier from sklearn.linear_model import Ridge from sklearn.linear_model import LogisticRegression from sklearn.model_selection import GridSearchCV",No,5,22.0 "filepath= '/kaggle/input/forest-cover-type-prediction/train.csv' filepath1= '/kaggle/input/forest-cover-type-prediction/test.csv' testdata= pd.read_csv(filepath1) testdata2=testdata traindata= pd.read_csv(filepath) traindata.head()",No,4,45.0 "#We remove the id column in both the training and testing datasets. 
traindata=traindata.drop('Id',axis=1) testdata=testdata.drop('Id',axis=1)",No,5,10.0 "#working with numeric features (They are all numerical features) numeric_features = traindata.select_dtypes(include=[np.number]) numeric_features.dtypes",No,5,70.0 "#We will define the training and testing data here: y=traindata['Cover_Type'] x=traindata.drop('Cover_Type',axis=1) x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.70,test_size=0.30, random_state=0)",No,5,13.0 "##Now we will run a few machine learning techiniques to see which one is the most applicable #Linear Regression linearRegressor = LinearRegression() linearRegressor.fit(x_train, y_train) y_predicted = linearRegressor.predict(x_test) mse = mean_squared_error(y_test, y_predicted) r = r2_score(y_test, y_predicted) mae = mean_absolute_error(y_test,y_predicted) print(""Mean Squared Error:"",mse) print(""R score:"",r) print(""Mean Absolute Error:"",mae)",Yes,4,7.0 "# Random Forest rf = RandomForestClassifier() rf.fit(x_train,y_train); y_predicted_r = rf.predict(x_test) mse = mean_squared_error(y_test, y_predicted_r) r = r2_score(y_test, y_predicted_r) mae = mean_absolute_error(y_test,y_predicted_r) print(""Mean Squared Error:"",mse) print(""R score:"",r) print(""Mean Absolute Error:"",mae)",Yes,4,7.0 "# Decision Tree - CART regressor = DecisionTreeRegressor(random_state = 0) regressor.fit(x_train, y_train) y_predicted_d = regressor.predict(x_test) mse = mean_squared_error(y_test, y_predicted_d) r = r2_score(y_test, y_predicted_d) mae = mean_absolute_error(y_test,y_predicted_d) print(""Mean Squared Error:"",mse) print(""R score:"",r) print(""Mean Absolute Error:"",mae)",Yes,4,7.0 "#Polynomial Regression polynomial_features= PolynomialFeatures(degree=2) x_poly = polynomial_features.fit_transform(x_train) x_poly_test = polynomial_features.fit_transform(x_test) model = LinearRegression() model.fit(x_poly, y_train) y_predicted_p = model.predict(x_poly_test) mse = mean_squared_error(y_test, y_predicted_p) r = r2_score(y_test, y_predicted_p) mae = mean_absolute_error(y_test,y_predicted_p) print(""Mean Squared Error:"",mse) print(""R score:"",r) print(""Mean Absolute Error:"",mae)",Yes,4,7.0 "#Ridge Regression ridgereg = Ridge(normalize=True) ridgereg.fit(x_train, y_train) y_pred = ridgereg.predict(x_test) mse = mean_squared_error(y_test, y_pred) r = r2_score(y_test, y_pred) mae = mean_absolute_error(y_test,y_pred) print(""Mean Squared Error:"",mse) print(""R score:"",r) print(""Mean Absolute Error:"",mae)",Yes,4,7.0 "# LGBMClassifier lgb_clf = LGBMClassifier(random_state=17) lgb_clf.fit(x_train, y_train) y_pred = lgb_clf.predict(x_test) mse = mean_squared_error(y_test, y_pred) r = r2_score(y_test, y_pred) mae = mean_absolute_error(y_test,y_pred) print(""Mean Squared Error:"",mse) print(""R score:"",r) print(""Mean Absolute Error:"",mae)",Yes,4,7.0 "#GridSearchCV param_grid = {'num_leaves': [7, 15, 31, 63], 'max_depth': [3, 4, 5, 6, -1]} grid_searcher = GridSearchCV(estimator=lgb_clf, param_grid=param_grid, cv=5, verbose=1, n_jobs=4) grid_searcher.fit(x_train, y_train) mse = mean_squared_error(y_test, y_pred) r = r2_score(y_test, y_pred) mae = mean_absolute_error(y_test,y_pred) print(""Mean Squared Error:"",mse) print(""R score:"",r) print(""Mean Absolute Error:"",mae)'",Yes,4,6.0 "# Random Forest rf = RandomForestClassifier() rf.fit(x,y); Prediction = rf.predict(testdata)",Yes,4,7.0 "predictionlist=Prediction.tolist() Passengerid=testdata2['Id'].tolist() output=pd.DataFrame(list(zip(Passengerid, predictionlist)), 
columns=['Id','Cover_type']) output.head() output.to_csv('my_submission(ForestCoverTypePrediction).csv', index=False)",Yes,4,25.0 "train_data = pd.read_csv(""/kaggle/input/forest-cover-type-prediction/train.csv"") train_data.shape",No,4,45.0 "test_data = pd.read_csv(""/kaggle/input/forest-cover-type-prediction/test.csv"") test_data.shape",No,4,45.0 "train_data.columns ",No,5,71.0 test_data.columns,No,5,71.0 train_data['Cover_Type'].value_counts(),No,5,72.0 "from sklearn.model_selection import train_test_split X_train,X_test,y_train,y_test = train_test_split(X,y)",No,5,13.0 "print(X_train.shape,y_train.shape) print(X_test.shape,y_test.shape)",Yes,4,14.0 "from sklearn.neighbors import KNeighborsClassifier knn = KNeighborsClassifier(n_neighbors=5) knn.fit(X_train,y_train) knn.score(X_test,y_test)",No,3,7.0 "pred = knn.predict(test_data.drop(""Id"",axis=1))",Yes,4,7.0 "submission = pd.DataFrame(data=pred,columns=[""Cover_Type""]) submission[""Id""] = test_data[""Id""] submission.set_index(""Id"",inplace=True)",No,5,55.0 "submission.to_csv(""Submission.csv"")",No,5,25.0 "import numpy as np import pandas as pd import seaborn as sns from sklearn.metrics import accuracy_score,confusion_matrix,roc_curve,auc from sklearn.preprocessing import StandardScaler from xgboost import XGBClassifier from sklearn.model_selection import train_test_split",No,5,22.0 "df_train = pd.read_csv('/kaggle/input/forest-cover-type-prediction/train.csv') df_test = pd.read_csv('/kaggle/input/forest-cover-type-prediction/test.csv') ",No,5,45.0 df_train.shape,No,5,58.0 "test_id =df_test['Id'] train_id = df_train ['Id']",No,5,77.0 df_train.isnull().sum(),No,5,39.0 df_test.isnull().sum(),No,5,39.0 df_test.columns,No,5,71.0 df_test.dtypes,No,5,70.0 "# From both train and test data df_train.drop(['Id'], axis = 1,inplace = True) df_test.drop(['Id'], axis = 1,inplace = True)",No,5,10.0 sns.heatmap(df_train.isnull()),No,5,80.0 sns.heatmap(df_test.isnull()),No,5,80.0 "corrmat = df_train.corr() sns.heatmap(corrmat,vmax = 0.8,square = True)",No,5,80.0 data_corr.head(),No,5,41.0 "plt.figure(figsize=(15,10)) pd.crosstab(train.Wilderness_Area, train.Cover_Type).plot.barh(figsize=(15,15),stacked = True)",No,5,33.0 "plt.figure(figsize=(15,10)) pd.crosstab(train.soil_type, train.Cover_Type).plot.barh(figsize=(15,15),stacked = True)",No,5,33.0 "plt.subplots(figsize=(10,10)) corr = df_viz.corr() ax = sns.heatmap( corr, vmin=-1, vmax=1, center=0, cmap=sns.diverging_palette(20, 220, n=200), square=True ) ax.set_xticklabels( ax.get_xticklabels(), rotation=45, horizontalalignment='right' ); ",No,5,80.0 "def add_feature(data): data['Ele_minus_VDtHyd'] = data.Elevation-data.Vertical_Distance_To_Hydrology data['Ele_plus_VDtHyd'] = data.Elevation+data.Vertical_Distance_To_Hydrology data['Distanse_to_Hydrolody'] = (data['Horizontal_Distance_To_Hydrology']**2+data['Vertical_Distance_To_Hydrology']**2)**0.5 data['Hydro_plus_Fire'] = data['Horizontal_Distance_To_Hydrology']+data['Horizontal_Distance_To_Fire_Points'] data['Hydro_minus_Fire'] = data['Horizontal_Distance_To_Hydrology']-data['Horizontal_Distance_To_Fire_Points'] data['Hydro_plus_Road'] = data['Horizontal_Distance_To_Hydrology']+data['Horizontal_Distance_To_Roadways'] data['Hydro_minus_Road'] = data['Horizontal_Distance_To_Hydrology']-data['Horizontal_Distance_To_Roadways'] data['Fire_plus_Road'] = data['Horizontal_Distance_To_Fire_Points']+data['Horizontal_Distance_To_Roadways'] data['Fire_minus_Road'] = data['Horizontal_Distance_To_Fire_Points']-data['Horizontal_Distance_To_Roadways'] return 
data",No,5,8.0 "train = add_feature(train) test = add_feature(test)",No,4,8.0 "X_train = train.drop(['Id','Cover_Type','soil_type','Wilderness_Area'], axis = 1) y_train = train.Cover_Type X_test = test.drop(['Id'], axis = 1)",No,5,21.0 "%%time lr_pipe = Pipeline( steps = [ ('scaler', MinMaxScaler()), ('classifier', LogisticRegression(solver='lbfgs', n_jobs=-1)) ] ) lr_param_grid = { 'classifier__C': [1, 10, 100,1000], } np.random.seed(1) grid_search = GridSearchCV(lr_pipe, lr_param_grid, cv=5, refit='True') grid_search.fit(X_train, y_train) print(grid_search.best_score_) print(grid_search.best_params_)",No,4,6.0 "%%time rf_pipe = Pipeline( steps = [ ('classifier', RandomForestClassifier(n_estimators=500)) ] ) param_grid = { 'classifier__min_samples_leaf': [2, 3, 4, 8], 'classifier__max_depth': [30, 32, 34], } np.random.seed(1) rf_grid_search = GridSearchCV(rf_pipe, param_grid, cv=5, refit='True', n_jobs=-1) rf_grid_search.fit(X_train, y_train) print(rf_grid_search.best_score_) print(rf_grid_search.best_params_)",No,5,6.0 "rf_model = rf_grid_search.best_estimator_ cv_score = cross_val_score(rf_model, X_train, y_train, cv = 5) print(cv_score) print(""Accuracy: %0.2f (+/- %0.2f)"" % (cv_score.mean(), cv_score.std() * 2))",Yes,5,1.0 rf = rf_grid_search.best_estimator_.steps[0][1],No,3,2.0 "feat_imp = rf.feature_importances_ feat_imp_df = pd.DataFrame({ 'feature':X_train.columns, 'feat_imp':feat_imp }) feat_imp_df.sort_values(by='feat_imp', ascending=False).head(10)",Yes,5,79.0 "sorted_feat_imp_df = feat_imp_df.sort_values(by='feat_imp', ascending=True) plt.figure(figsize=[6,6]) plt.barh(sorted_feat_imp_df.feature[-20:], sorted_feat_imp_df.feat_imp[-20:]) plt.show()",No,5,79.0 "%%time xgd_pipe = Pipeline( steps = [ ('classifier', XGBClassifier(n_estimators=50, subsample=0.5)) ] ) param_grid = { 'classifier__learning_rate' : [0.45], 'classifier__min_samples_split' : [8, 16, 32], 'classifier__min_samples_leaf' : [2], 'classifier__max_depth': [15] } np.random.seed(1) xgd_grid_search = GridSearchCV(xgd_pipe, param_grid, cv=5, refit='True', verbose = 10, n_jobs=-1) xgd_grid_search.fit(X_train, y_train) print(xgd_grid_search.best_score_) print(xgd_grid_search.best_params_)",Yes,4,6.0 "xgd_model = xgd_grid_search.best_estimator_ cv_score = cross_val_score(xgd_model, X_train, y_train, cv = 5) print(cv_score) print(""Accuracy: %0.2f (+/- %0.2f)"" % (cv_score.mean(), cv_score.std() * 2))",Yes,5,28.0 final_model = xgd_grid_search.best_estimator_.steps[0][1],No,5,3.0 "final_model.fit(X_train, y_train) y_pred = final_model.predict(X_test)",Yes,4,7.0 print(len(test.Id)),No,5,40.0 print(len(y_pred)),No,5,40.0 "from collections import Counter Counter(y_pred)",Yes,5,72.0 submission_sample.head(),No,5,41.0 "submission = pd.DataFrame({'Id': test.Id, 'Cover_Type': y_pred}) submission.head()",Yes,5,55.0 test_data['Slope'].plot(kind='hist'),No,5,33.0 train_data['Elevation'].plot(kind='hist'),No,5,33.0 "X=train_data.drop(labels=['Id','Cover_Type'],axis=1)",No,5,10.0 y=train_data['Cover_Type'],No,5,21.0 "X_train,X_val,y_train,y_val=train_test_split(X,y,random_state=40)",No,5,13.0 "rfc=RandomForestClassifier(n_estimators=70) rfc.fit(X_train,y_train) ",Yes,5,7.0 "Submission=pd.DataFrame(data=predict,columns=['Cover_type']) Submission.head()",Yes,5,55.0 "Submission['Id']=test_data['Id'] Submission.set_index('Id',inplace=True) Submission.head()",Yes,5,55.0 "dataset_train = pd.read_csv('/kaggle/input/forest-cover-type-prediction/train.csv') dataset_test = pd.read_csv('/kaggle/input/forest-cover-type-prediction/test.csv') 
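# Editor's sketch (hedged addition): a quick structural check before the copies are
# made below - it only prints each frame's shape and whether the Cover_Type label
# column is present (expected True for train, False for test).
for _name, _frame in [('train', dataset_train), ('test', dataset_test)]:
    print(_name, _frame.shape, 'Cover_Type' in _frame.columns)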
dataset_train_copy = dataset_train.copy() dataset_test_copy = dataset_test.copy()",No,4,45.0 dataset_train.shape,No,5,58.0 dataset_test.shape,No,5,58.0 "dataset_train_copy.drop('Id', axis=1, inplace=True) dataset_test_copy.drop('Id', axis=1, inplace=True)",No,5,10.0 "X = dataset_train_copy.iloc[:, :-1].values y = dataset_train_copy.iloc[:, -1].values X_submission = dataset_test_copy.iloc[:, :].values",No,3,21.0 "from sklearn.preprocessing import StandardScaler sc = StandardScaler() X_train = sc.fit_transform(X) X_submission = sc.transform(X_submission) y_train = y",Yes,5,18.0 "from sklearn.ensemble import RandomForestClassifier classifier = RandomForestClassifier(n_estimators = 30, criterion = 'entropy', random_state = 0) classifier.fit(X_train, y_train)",Yes,5,7.0 "y_submission = classifier.predict(X_submission) dataset_submission = pd.DataFrame({'Id':dataset_test.iloc[:,0], 'Cover_Type': y_submission}) dataset_submission.set_index('Id', inplace=True)",Yes,5,55.0 dataset_submission,No,5,41.0 dataset_submission.to_csv('Submission.csv'),No,5,25.0 "train_df=pd.read_csv('/kaggle/input/forest-cover-type-prediction/train.csv') train_df.head()",Yes,4,45.0 "test_df=pd.read_csv('/kaggle/input/forest-cover-type-prediction/test.csv') test_df.head()",Yes,4,45.0 train_df.shape,No,5,58.0 test_df.shape,No,5,58.0 test_id=test_df['Id'],No,3,14.0 "train_df.drop(['Id'],axis=1,inplace=True) test_df.drop(['Id'],axis=1,inplace=True)",No,5,10.0 "X=train_df.drop(['Cover_Type'],axis=1) y=train_df['Cover_Type']",No,5,21.0 "X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=40)",No,5,13.0 "from sklearn.neighbors import KNeighborsClassifier KNN=KNeighborsClassifier(n_neighbors=6) KNN.fit(X_train,y_train)",Yes,5,7.0 "KNN.score(X_test,y_test)",No,5,49.0 pred=KNN.predict(test_df),No,5,48.0 "result=pd.DataFrame(data=pred,columns=['Cover_Type'])",No,5,12.0 "result['Id']=test_id result.set_index('Id',inplace=True) result.head()",Yes,3,55.0 result.to_csv('Submission.csv'),No,5,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv) import matplotlib.pyplot as plt # Input data files are available in the read-only ""../input/"" directory # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory import os for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename)) # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using ""Save & Run All"" # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session'",Yes,5,88.0 "test_data = pd.read_csv('/kaggle/input/forest-cover-type-prediction/test.csv') test_data.head()",Yes,4,45.0 "X = train_data.drop(labels = ['Id','Cover_Type'],axis = 1) Y = train_data['Cover_Type']",No,5,21.0 "X_train,X_val,Y_train,Y_val = train_test_split(X,Y,random_state = 40)",No,5,13.0 "print(X_train.shape,Y_train.shape) print(X_val.shape,Y_val.shape)",No,5,58.0 "rfc=RandomForestClassifier(n_estimators=70) rfc.fit(X_train,Y_train)",Yes,5,7.0 "rfc.score(X_val,Y_val)",No,5,49.0 "Submission['Id']=test_data['Id'] Submission.set_index('Id',inplace=True)",No,5,55.0 Submission.to_csv('Submission_first_time.csv'),No,5,25.0 train_data.columns,No,5,71.0 "KNN = KNeighborsClassifier(n_neighbors = 11, n_jobs = -1) KNN.fit(X_train,y_train)",Yes,5,7.0 "KNN.score(X_val,y_val)",No,5,49.0 "predict=KNN.predict(test_data.drop(labels=['Id'],axis=1))",No,5,48.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv) import matplotlib.pyplot as plt %matplotlib inline # Input data files are available in the read-only ""../input/"" directory # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory import os for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename)) # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using ""Save & Run All"" # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session'",No,5,88.0 "Submission['Id']=test_data['Id'] Submission.set_index('Id',inplace=True) ",No,5,55.0 "import numpy as np import pandas as pd from sklearn import cross_validation, grid_search, linear_model, metrics, pipeline, preprocessing",No,5,22.0 "def rmsle(y, y_): log1 = np.nan_to_num(np.array([np.log(v + 1) for v in y])) log2 = np.nan_to_num(np.array([np.log(v + 1) for v in y_])) calc = (log1 - log2) ** 2 return np.sqrt(np.mean(calc))",No,5,84.0 "data = pd.read_csv(""../input/train.csv"")",No,5,45.0 data.head(3),No,5,41.0 data.isnull().values.any(),No,4,39.0 "data.datetime = data.datetime.apply(pd.to_datetime) data['month'] = data.datetime.apply(lambda x : x.month) data['hour'] = data.datetime.apply(lambda x : x.hour) data.head()",No,4,8.0 "train_data = data.iloc[:-1000, :] test_data = data.iloc[-1000:, :] print(data.shape, train_data.shape, test_data.shape) ",Yes,4,13.0 "train_labels = train_data['count'].values train_data = train_data.drop(['datetime', 'count', 'casual', 'registered'], axis = 1) test_labels = test_data['count'].values test_data = test_data.drop(['datetime', 'count', 'casual', 'registered'], axis = 1)",No,4,21.0 "binary_data_columns = ['holiday', 'workingday'] binary_data_indices = np.array([(column in binary_data_columns) for column in train_data.columns], dtype = bool) categorical_data_columns = ['season', 'weather', 'month'] categorical_data_indices = np.array([(column in categorical_data_columns) for column in train_data.columns], dtype = bool) numeric_data_columns = ['temp', 'atemp', 'humidity', 'windspeed', 'hour'] numeric_data_indices = np.array([(column in numeric_data_columns) for column in train_data.columns], dtype = bool)",No,4,37.0 "transformer_list = [ #binary ('binary_variables_processing', preprocessing.FunctionTransformer(lambda data: data[:, binary_data_indices])), #numeric ('numeric_variables_processing', pipeline.Pipeline(steps = [ ('selecting', preprocessing.FunctionTransformer(lambda data: data[:, numeric_data_indices])), ('scaling', preprocessing.StandardScaler(with_mean = 0)) ])), #categorical ('categorical_variables_processing', pipeline.Pipeline(steps = [ ('selecting', preprocessing.FunctionTransformer(lambda data: data[:, categorical_data_indices])), ('hot_encoding', preprocessing.OneHotEncoder(handle_unknown = 'ignore')) ])), ]",No,5,8.0 regressor = linear_model.Lasso(max_iter = 2000),No,5,4.0 "estimator = pipeline.Pipeline(steps = [ ('feature_processing', pipeline.FeatureUnion(transformer_list=transformer_list)), ('model_fitting', regressor) ] ) estimator.fit(train_data, train_labels) predicted = estimator.predict(test_data) print(""RMSLE: "", rmsle(test_labels, predicted)) print(""MAE: "", metrics.mean_absolute_error(test_labels, predicted))'",Yes,4,49.0 "parameters_grid = { 'model_fitting__alpha' : [0.1, 1, 2, 3, 4, 10, 30] }",No,5,5.0 "rmsle_scorer = metrics.make_scorer(rmsle, greater_is_better=False) 
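# Editor's sketch (hedged addition): a quick sanity check of the rmsle function
# defined a few cells above, before it is wrapped in a scorer. Identical inputs
# must give 0.0; the second call should give a positive value (roughly 0.59).
print(rmsle([1, 10, 100], [1, 10, 100]))    # expected 0.0
print(rmsle([1, 10, 100], [2, 20, 200]))    # expected roughly 0.59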
grid_cv = grid_search.GridSearchCV(estimator, parameters_grid, scoring = rmsle_scorer, cv = 4) grid_cv.fit(train_data, train_labels) predicted = grid_cv.best_estimator_.predict(test_data) print(""RMSLE: "", rmsle(test_labels, predicted)) #print(""MAE: "", metrics.mean_absolute_error(test_labels, predicted)) print(""Best params: "", grid_cv.best_params_)",No,5,2.0 estimator.get_params().keys(),No,5,79.0 "from sklearn.ensemble import RandomForestRegressor regressor = RandomForestRegressor(random_state = 0, max_depth = 20, n_estimators = 150) estimator = pipeline.Pipeline(steps = [ ('feature_processing', pipeline.FeatureUnion(transformer_list = transformer_list)), ('model_fitting', regressor) ] ) estimator.fit(train_data, train_labels) #metrics.mean_absolute_error(test_labels, estimator.predict(test_data)) print(""RMSLE: "", rmsle(test_labels, estimator.predict(test_data)))'",Yes,4,49.0 "%pylab inline pylab.figure(figsize=(8, 3)) pylab.subplot(1,2,1) pylab.grid(True) pylab.xlim(-100,1100) pylab.ylim(-100,1100) pylab.scatter(train_labels, grid_cv.best_estimator_.predict(train_data), alpha=0.5, color = 'red') pylab.scatter(test_labels, grid_cv.best_estimator_.predict(test_data), alpha=0.5, color = 'blue') pylab.title('linear model') pylab.subplot(1,2,2) pylab.grid(True) pylab.xlim(-100,1100) pylab.ylim(-100,1100) pylab.scatter(train_labels, estimator.predict(train_data), alpha=0.5, color = 'red') pylab.scatter(test_labels, estimator.predict(test_data), alpha=0.5, color = 'blue') pylab.title('random forest model')",No,5,56.0 "from sklearn.ensemble import GradientBoostingRegressor gbr = GradientBoostingRegressor(n_estimators=200, learning_rate=0.9, max_depth = 4) estimator = pipeline.Pipeline(steps = [ ('feature_processing', pipeline.FeatureUnion(transformer_list = transformer_list)), ('model_fitting', gbr) ] ) estimator.fit(train_data, train_labels) #metrics.mean_absolute_error(test_labels, estimator.predict(test_data)) print(""RMSLE: "", rmsle(test_labels, estimator.predict(test_data)))'",Yes,4,49.0 "%pylab inline pylab.figure(figsize=(8, 3)) pylab.subplot(1,2,1) pylab.grid(True) pylab.xlim(-100,1100) pylab.ylim(-100,1100) pylab.scatter(train_labels, grid_cv.best_estimator_.predict(train_data), alpha=0.5, color = 'red') pylab.scatter(test_labels, grid_cv.best_estimator_.predict(test_data), alpha=0.5, color = 'blue') pylab.title('linear model') pylab.subplot(1,2,2) pylab.grid(True) pylab.xlim(-100,1100) pylab.ylim(-100,1100) pylab.scatter(train_labels, estimator.predict(train_data), alpha=0.5, color = 'red') pylab.scatter(test_labels, estimator.predict(test_data), alpha=0.5, color = 'blue') pylab.title('gbr model')",No,5,56.0 "real_test_data = pd.read_csv(""../input/test.csv"") real_test_data_ids = real_test_data[""datetime""] real_test_data.head()",No,4,45.0 "real_test_data.datetime = real_test_data.datetime.apply(pd.to_datetime) real_test_data['month'] = real_test_data.datetime.apply(lambda x : x.month) real_test_data['hour'] = real_test_data.datetime.apply(lambda x : x.hour) real_test_data.head()",No,4,8.0 "real_test_data = real_test_data.drop(['datetime'], axis = 1)",No,5,10.0 real_test_predictions = estimator.predict(real_test_data),No,5,48.0 "submission.to_csv('bike_predictions.csv', index=False)",No,5,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load in import numpy as np # linear algebra import pandas as pd 
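# Editor's sketch (hedged, optional): a small provenance print; knowing the library
# versions makes the numbers produced later in this notebook easier to reproduce.
print('numpy', np.__version__, '| pandas', pd.__version__)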
# data processing, CSV file I/O (e.g. pd.read_csv) import seaborn as sns import matplotlib.pyplot as plt # Input data files are available in the ""../input/"" directory. # For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory from subprocess import check_output print(check_output([""ls"", ""../input""]).decode(""utf8"")) # Any results you write to the current directory are saved as output. '",No,5,88.0 "# load train_set&test_set train=pd.read_csv(""../input/train.csv"") test=pd.read_csv(""../input/test.csv"") test['casual']=0 test['registered']=0 test['count']=0 #remove Outlier piont train = train[np.abs(train[""count""]-train[""count""].mean())<=(3*train[""count""].std())] '",Yes,4,45.0 "#create a union data union_data=pd.concat([train,test],ignore_index=True)",No,5,11.0 "#add date columns union_data['day']=pd.to_datetime(union_data.datetime).dt.day union_data['year']=pd.to_datetime(union_data.datetime).dt.year union_data['month']=pd.to_datetime(union_data.datetime).dt.month union_data['weekday']=pd.to_datetime(union_data.datetime).dt.weekday union_data['date']=pd.to_datetime(union_data.datetime).dt.date union_data['hour']=pd.to_datetime(union_data.datetime).dt.hour union_data['year_season']=union_data.apply(lambda x:'{}_{}'.format(str(x['year']),str(x['season'])),axis=1) union_data['year_month']=union_data.apply(lambda x:'{}_{}'.format(str(x['year']),str(x['month'])),axis=1) #missing data fill union_data['windspeed']=union_data[['year','month','hour','windspeed']].groupby(['year','month','hour']).transform(lambda x:x.replace(0,np.median([i for i in x if i>0]))) union_data['windspeed']=pd.cut(union_data['windspeed'],bins=[0,20,60],labels=['0','1']) ",Yes,4,16.0 "#add day_type columns union_data['day_type']=0 union_data['day_type'][(union_data['holiday']==0)& (union_data['workingday']==0)]='weekend' union_data['day_type'][(union_data['holiday']==0)& (union_data['workingday']==1)]='workingday' union_data['day_type'][(union_data['holiday']==1)]='holiday' ",No,5,20.0 "#create train set train=union_data[:10739]",No,4,13.0 "#windspeed counts plt.figure(figsize=(100,5)) g=sns.factorplot(x='windspeed',y='count',data=train,col='year',kind='bar',estimator=sum,ci=None,size=10,aspect=1)",No,5,33.0 "#season trend g=sns.factorplot(x='season',y='count',data=train,col='year',kind='bar',estimator=sum,ci=None,size=10,aspect=1)",No,5,33.0 "#month trend g=sns.factorplot(x='month',y='count',data=train,col='year',kind='bar',estimator=sum,ci=None,size=10,aspect=1)",No,5,75.0 "#day trend g=sns.factorplot(x='day',y='count',data=train,col='year',kind='bar',estimator=sum,ci=None,size=10,aspect=1)",No,5,75.0 "#weekday trend g=sns.factorplot(x='weekday',y='count',data=train,col='year',kind='bar',estimator=sum,ci=None,size=10,aspect=1)",No,5,75.0 "#hour trend g=sns.factorplot(x='hour',y='count',data=train,col='year',kind='bar',estimator=sum,ci=None,size=10,aspect=1)",No,5,75.0 "#weather analyse g=sns.factorplot(x='weather',y='count',data=train,col='year',kind='bar',estimator=sum,ci=None,size=10,aspect=1)",No,5,33.0 "#workingday analyse g=sns.factorplot(x='workingday',y='count',data=train,col='year',kind='bar',estimator=sum,ci=None,size=10,aspect=1)",No,5,33.0 "#tempture analyse g=sns.factorplot(x='temp',y='count',data=train,col='year',kind='bar',estimator=sum,ci=None,size=10,aspect=1)",No,5,33.0 "from sklearn import tree clf = tree.tree.DecisionTreeRegressor(max_depth=4,criterion='mse',min_samples_leaf=800) clf = 
clf.fit(train['hour'].reshape(-1,1),np.ravel(train['count'])) import graphviz dot_data = tree.export_graphviz(clf, out_file=None,feature_names=['hour'], filled=True, rounded=True, special_characters=True,) graph = graphviz.Source(dot_data) graph #dot_data = tree.export_graphviz(clf, out_file=None,feature_names=train[['hour']].columns.values,class_names=train[['count']].columns.values) #graph = graphviz.Source(dot_data) #graph ",No,3,7.0 train_X.columns,No,5,71.0 "regr = RandomForestRegressor(n_estimators=300) regr.fit(train_X.loc[:,'year_month':], np.ravel(train_y)) reg=GradientBoostingRegressor(n_estimators=2000, learning_rate=0.01,max_depth=4) reg.fit(train_X.loc[:,'year_month':], np.ravel(train_y)) ",No,5,7.0 "np.exp(regr.predict(test_X.loc[:,'year_month':]))-1 ",No,5,55.0 "np.exp(reg.predict(test_X.loc[:,'year_month':]))-1",No,5,48.0 "union_data['count'][10739:]=np.exp(reg.predict(test_X.loc[:,'year_month':]))-1",No,5,8.0 "submission=pd.DataFrame({ ""datetime"": union_data[10739:].datetime, ""count"": union_data[10739:]['count'] }) submission.to_csv('bike_predictions_gbm_separate_without_fe.csv', index=False)'",No,5,25.0 "import numpy as np import pandas as pd import matplotlib.pyplot as plt import seaborn as sn from datetime import datetime from scipy import stats",Yes,4,22.0 "trainData = pd.read_csv('../input/train.csv') testData = pd.read_csv('../input/test.csv')",No,5,45.0 trainData.head(2),No,5,41.0 "fig, axes = plt.subplots(figsize=(15, 4), ncols=2, nrows=1) sn.distplot(trainData[""count""],ax=axes[0]) plt.plot(pd.rolling_mean(trainData['count'], 100)) plt.show()'",No,5,33.0 "trainData['logcount'] = trainData['count'].apply(lambda x: np.log1p(x)) fig, axes = plt.subplots(figsize=(15, 8)) sn.distplot(trainData[""logcount""], ax=axes)'",No,5,33.0 "trainData['date'] = trainData.datetime.apply(lambda x : x.split()[0]) trainData['hour'] = trainData.datetime.apply(lambda x : x.split()[1].split("":"")[0]) trainData['weekday'] = trainData.date.apply(lambda dateString : datetime.strptime(dateString, '%Y-%m-%d').weekday()) trainData['month'] = trainData.date.apply(lambda dateString : datetime.strptime(dateString, '%Y-%m-%d').month) testData['date'] = testData.datetime.apply(lambda x : x.split()[0]) testData['hour'] = testData.datetime.apply(lambda x : x.split()[1].split("":"")[0]) testData['weekday'] = testData.date.apply(lambda dateString : datetime.strptime(dateString, '%Y-%m-%d').weekday()) testData['month'] = testData.date.apply(lambda dateString : datetime.strptime(dateString, '%Y-%m-%d').month) timeColumn = testData['datetime']'",No,5,8.0 "import xgboost as xgb X = trainData.drop(['count', 'datetime', 'registered', 'casual', 'date', 'logcount'], axis=1).values Y = trainData['logcount'].values testX = testData.drop(['datetime', 'date'], axis=1).values trainMatrix = xgb.DMatrix(X, label=Y) max_depth = 5 min_child_weight = 8 subsample = 0.9 num_estimators = 1000 learning_rate = 0.1 clf = xgb.XGBRegressor(max_depth=max_depth, min_child_weight=min_child_weight, subsample=subsample, n_estimators=num_estimators, learning_rate=learning_rate) clf.fit(X,Y) pred = clf.predict(testX) pred = np.expm1(pred) submission = pd.DataFrame({ ""datetime"": timeColumn, ""count"": pred }) submission.to_csv('XGBNoFE.csv', index=False)'",Yes,2,25.0 "fig, axes = plt.subplots(nrows=1,ncols=2) fig.set_size_inches(15, 8) sn.boxplot(data=trainData, y='count', x='season', ax=axes[0]) sn.boxplot(data=trainData, y='count', x='workingday', ax=axes[1]) axes[0].set(xlabel='season', ylabel='count') 
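# Editor's sketch (hedged addition): a numeric companion to the two box plots in
# this figure - median count by season and by workingday. Read-only; it does not
# alter the axes being configured here.
print(trainData.groupby('season')['count'].median())
print(trainData.groupby('workingday')['count'].median())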
axes[1].set(xlabel='workingday', ylabel='count')",No,5,33.0 "fix, axes = plt.subplots(figsize=(15, 10)) sn.boxplot(data=trainData, y='count', x='hour', ax=axes)",No,5,75.0 "corrMat = trainData.corr() mask = np.array(corrMat) mask[np.tril_indices_from(mask)] = False fig, ax= plt.subplots(figsize=(20, 10)) sn.heatmap(corrMat, mask=mask,vmax=1., square=True,annot=True)",No,5,80.0 "fig, axes = plt.subplots(nrows=4, ncols=1, figsize=(15, 15)) meanMonthly = pd.DataFrame(trainData.groupby('month')['count'].mean()).reset_index().sort_values(by='count', ascending=False) sn.barplot(data=meanMonthly, x='month', y='count', ax=axes[0]) axes[0].set(xlabel='month', ylabel='count') hoursSeasonly = pd.DataFrame(trainData.groupby(['hour', 'season'], sort=True)['count'].mean()).reset_index() sn.pointplot(x=hoursSeasonly['hour'], y=hoursSeasonly['count'], hue=hoursSeasonly['season'], data=hoursSeasonly, join=True, ax=axes[1]) axes[1].set(xlabel='hour', ylabel='count') hoursDayly = pd.DataFrame(trainData.groupby(['hour','weekday'], sort=True)['count'].mean()).reset_index() sn.pointplot(x=hoursDayly['hour'], y=hoursDayly['count'], hue=hoursDayly['weekday'], data=hoursDayly, join=True,ax=axes[2]) axes[2].set(xlabel='hour', ylabel='count') hoursSeasonly = pd.DataFrame(trainData.groupby(['hour', 'month'], sort=True)['count'].mean()).reset_index() sn.pointplot(x=hoursSeasonly['hour'], y=hoursSeasonly['count'], hue=hoursSeasonly['month'], data=hoursSeasonly, join=True, ax=axes[3]) axes[1].set(xlabel='hour', ylabel='count')",No,5,75.0 "X = trainData.drop(['date', 'temp', 'casual', 'registered', 'logcount', 'datetime', 'count'], axis=1) season_df = pd.get_dummies(trainData['season'], prefix='s', drop_first=True) weather_df = pd.get_dummies(trainData['weather'], prefix='w', drop_first=True) hour_df = pd.get_dummies(trainData['hour'], prefix='h', drop_first=True) weekday_df = pd.get_dummies(trainData['weekday'], prefix='d', drop_first=True) month_df = pd.get_dummies(trainData['month'], prefix='m', drop_first=True) X = X.join(season_df) X = X.join(weather_df) X = X.join(hour_df) X = X.join(weekday_df) X = X.join(month_df) X = X.values Y=trainData['logcount'].values print(X.shape) testX = testData.drop(['date', 'temp', 'datetime'], axis=1) season_df = pd.get_dummies(testData['season'], prefix='s', drop_first=True) weather_df = pd.get_dummies(testData['weather'], prefix='w', drop_first=True) hour_df = pd.get_dummies(testData['hour'], prefix='h', drop_first=True) weekday_df = pd.get_dummies(testData['weekday'], prefix='d', drop_first=True) month_df = pd.get_dummies(testData['month'], prefix='m', drop_first=True) testX = testX.join(season_df) testX = testX.join(weather_df) testX = testX.join(hour_df) testX = testX.join(weekday_df) testX = testX.join(month_df) testX = testX.values print(testX.shape)",Yes,4,20.0 "clf=xgb.XGBRegressor(max_depth=8,min_child_weight=6,gamma=0.4,colsample_bytree=0.6,subsample=0.6) clf.fit(X,Y) pred = clf.predict(testX) pred = np.expm1(pred) submission = pd.DataFrame({ ""datetime"": timeColumn, ""count"": pred }) submission.to_csv('XGBwithFE.csv', index=False)'",Yes,4,25.0 "from sklearn.ensemble import GradientBoostingRegressor from sklearn.model_selection import GridSearchCV from sklearn.ensemble import RandomForestRegressor from sklearn.metrics import make_scorer ",No,5,22.0 "def loss_func(truth, prediction): truth = np.expm1(truth) prediction = np.expm1(prediction) log1 = np.array([np.log(x + 1) for x in truth]) log2 = np.array([np.log(x + 1) for x in prediction]) return np.sqrt(np.mean((log1 - 
log2)**2))",No,5,84.0 "b""param_grid = {\n 'n_estimators': [50, 80, 100, 120],\n 'max_depth': [None, 1, 2, 5],\n 'max_features': ['sqrt', 'log2', 'auto']\n}\n\nscorer = make_scorer(loss_func, greater_is_better=False)\n\nregr = RandomForestRegressor(random_state=42)\n\nrfr = GridSearchCV(regr, param_grid, cv=4, scoring=scorer, n_jobs=4).fit(X, Y)\nprint('\\tParams:', rfr.best_params_)\nprint('\\tScore:', rfr.best_score_)""",No,4,2.0 "pred = rfr.predict(testX) pred = np.expm1(pred) submission = pd.DataFrame({ ""datetime"": timeColumn, ""count"": pred }) submission.to_csv('RandomForest.csv', index=False)'",Yes,4,48.0 "b""#\n#param_grid = {\n# 'learning_rate': [0.1, 0.01, 0.001, 0.0001],\n# 'n_estimators': [100, 1000, 1500, 2000, 4000],\n# 'max_depth': [1, 2, 3, 4, 5, 8, 10]\n#}\n#\n#scorer = make_scorer(loss_func, greater_is_better=False)\n#\n#gb = GradientBoostingRegressor(random_state=42)\n#\n#gbr = GridSearchCV(gb, param_grid, cv=4, scoring=scorer, n_jobs=3).fit(X, Y)\n#print('\\tParams:', gbr.best_params_)\n#print('\\tScore:', gbr.best_score_)\n\ngbr = GradientBoostingRegressor(n_estimators=2000, learning_rate=0.01, max_depth=4)\n\ngbr.fit(X, Y)""",No,5,7.0 "pred = gbr.predict(testX) pred = np.expm1(pred) submission = pd.DataFrame({ ""datetime"": timeColumn, ""count"": pred }) submission.to_csv('GradientBoost.csv', index=False)'",Yes,4,48.0 "df = pd.read_csv('../input/train.csv') df.head()",No,4,45.0 "def null_percentage(column): df_name = column.name nans = np.count_nonzero(column.isnull().values) total = column.size frac = nans / total perc = int(frac * 100) print('%d%% of values or %d missing from %s column.' % (perc, nans, df_name)) def check_null(df, columns): for col in columns: null_percentage(df[col]) check_null(df, df.columns)",No,5,39.0 "def process_features(df): # Get month, day of month, and time of day. months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December'] df['month'] = df.datetime.apply(lambda x: months[int(x[5:7]) - 1]) df['day'] = df.datetime.apply(lambda x: x[8:10]).astype(int) df['hour'] = df.datetime.apply(lambda x: x[11:13]).astype(int) def get_season(m): if m in ['January', 'February', 'December']: return 'Winter' elif m in [ 'March', 'April', 'May']: return 'Spring' elif m in ['June', 'July','August']: return 'Summer' else: return 'Fall' df['real_seasons'] = df.month.apply(lambda x: get_season(x)) # Change ""feels like"" temperature to deviation from the mean of 24, which is a comfortable temperature. median_temp = df.atemp.median() df['temp_dev'] = df.atemp.apply(lambda x: x - median_temp) # Create a date object and use it to extract day of week. 
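# Editor's note (hedged sketch): an equivalent, more compact day-of-week lookup
# uses the pandas dt accessor once the datetime column is parsed, e.g.
#     df['day_of_week'] = pd.to_datetime(df['datetime']).dt.day_name()
# The explicit weekdays list below is kept as this notebook's actual approach.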
df['date'] = df.datetime.apply(lambda x: dt.strptime(x, ""%Y-%m-%d %H:%M:%S"").date()) weekdays = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday'] df['day_of_week'] = df.date.apply(lambda x: weekdays[x.weekday()]) df['weekend'] = df.day_of_week.apply(lambda x: 1 if x in ['Saturday', 'Sunday'] else 0) df = df.drop(['date', 'datetime'], axis=1) print(df.columns) return df'",Yes,4,8.0 df = process_features(pd.read_csv('../input/train.csv')),No,4,45.0 "#Ridership by Month plt.figure('Daily rides by Day of Week', figsize=(10, 26)) plt.suptitle('Daily Rides by Day of Week', fontsize=20) plt.subplot(311) sns.boxplot(x='day_of_week', y='count', data=df) plt.title('All Riders', fontsize=16) plt.subplot(312) sns.boxplot(x='day_of_week', y='casual', data=df) plt.title('Casual Riders', fontsize=16) plt.subplot(313) sns.boxplot(x='day_of_week', y='registered', data=df) plt.title('Registered Riders', fontsize=16) plt.show()",No,5,75.0 "#Ridership by Season plt.figure('Daily rides by Season', figsize=(10, 20)) plt.suptitle('Daily Rides by Season', fontsize=20) plt.subplot(311) sns.boxplot(x='real_seasons', y='count', hue='weekend', data=df) plt.title('All Riders', fontsize=16) plt.subplot(312) sns.boxplot(x='real_seasons', y='casual', hue='weekend', data=df) plt.title('Casual Riders', fontsize=16) plt.subplot(313) sns.boxplot(x='real_seasons', y='registered', hue='weekend', data=df) plt.title('Registered Riders', fontsize=16) plt.show()",No,5,33.0 "#Ridership by Season plt.figure('Daily rides by Month', figsize=(10, 20)) plt.suptitle('Daily Rides by Month', fontsize=20) plt.subplot(311) sns.boxplot(x='month', y='count', hue='weekend', data=df) plt.title('All Riders', fontsize=16) plt.subplot(312) sns.boxplot(x='month', y='casual', hue='weekend', data=df) plt.title('Casual Riders', fontsize=16) plt.subplot(313) sns.boxplot(x='month', y='registered', hue='weekend', data=df) plt.title('Registered Riders', fontsize=16) plt.show()",No,5,75.0 "plt.figure('Wind by month') sns.boxplot(x='month', y='windspeed', data=df) plt.title('Windspeed by Month', fontsize=20) plt.show()",No,5,75.0 df.weather.value_counts(),No,5,72.0 "plt.figure('Weather and Ridership', figsize=(10, 20)) plt.suptitle('Weather and Ridership', fontsize=20) plt.subplot(311) sns.boxplot(x='weather', y='count', data=df) plt.title('All Riders', fontsize=14) plt.subplot(312) sns.boxplot(x='weather', y='casual', data=df) plt.title('Casual Riders', fontsize=14) plt.subplot(313) sns.boxplot(x='weather', y='registered', data=df) plt.title('Registered Riders', fontsize=14) plt.show()",Yes,5,33.0 "def corr_heatmap(df, title): plt.figure('heatmap', figsize=(15,15)) plt.suptitle(plt.title(title, fontsize=30)) df_corr = df.corr() sns.heatmap(df_corr, vmax=0.6, square=True, annot=False, cmap='Blues') plt.yticks(rotation = 0) plt.xticks(rotation = 90) plt.show() corr_heatmap(pd.get_dummies(df), 'Correlation Matrix of All Features')",No,5,80.0 "import pandas as pd df = process_features(pd.read_csv('../input/train.csv')) df_submit = process_features(pd.read_csv('../input/test.csv')) def clean_weather(df): df.loc[df['weather'] == 4, 'weather'] = 3 return df df = clean_weather(df) df_test = clean_weather(df_submit)",Yes,4,45.0 "remove_columns = ['season', #'holiday', 'workingday', #'weather', #'temp', 'atemp', #'humidity', 'windspeed', #'month', 'day', #'hour', #'real_seasons', #'temp_dev', #'day_of_week', 'weekend' ] # Going to make this a multi-label ensemble problem and let make these three # predictions into features that feed into an 
overall model. target_labels = ['casual', 'registered', 'count']",No,5,77.0 "# Strip unwanted features df_train = df.drop(remove_columns, axis=1) df_targets = df_train[target_labels] df_train = df_train.drop(target_labels, axis=1) df_submit = df_test.drop(remove_columns, axis=1) print(df_train.columns) print(df_submit.columns) df_train = pd.get_dummies(df_train) df_submit = pd.get_dummies(df_submit) print(df_train.shape[1] == df_submit.shape[1]) from sklearn.preprocessing import StandardScaler scaler = StandardScaler() df_train = scaler.fit_transform(df_train) df_submit = scaler.transform(df_submit) np_train = np.array(df_train) np_targets = np.array(df_targets) np_submit = np.array(df_submit) from sklearn.model_selection import train_test_split X_train, X_test, y_train, y_test = train_test_split(np_train, np_targets, test_size=0.15)",Yes,3,13.0 "print(X_train.shape) print(y_train.shape)",No,5,58.0 "def rmsle(y_true,y_pred): assert len(y_true) == len(y_pred) return np.square(np.log(y_pred + 1) - np.log(y_true + 1)).mean() ** 0.5",No,5,84.0 "from sklearn.ensemble import RandomForestRegressor rf = RandomForestRegressor(n_estimators=5, n_jobs=-1) rf.fit(X_train, y_train) y_pred = rf.predict(X_test) y_pred[y_pred < 0] = 1 print('RMSLE of predicting total count: %.4f' % rmsle(y_test[:,2], y_pred[:,2])) print('RMSLE combining casual and registered predictions: %.4f' % rmsle(y_test[:,2], np.sum(y_pred[:,0:2], axis=1)))",No,4,49.0 "b""count0 = 0\ncount1 = 0\nfor (a,b) in zip(np.sum(y_pred[:,0:2].astype(int), axis=1), y_pred[:,2].astype(int)):\n #print(a, b)\n if abs(a - b) == 0:\n count0 +=1\n if abs(a - b) <= 1:\n count1 +=1\nprint('Exact: %d' % count0)\nprint('Within one: %d ' % count1)\nprint('Total: %d ' % y_pred.shape[0])\nprint('Sum of registered and casual rider predictions is exactly the total count \\nprediction %d%% of the time and within one 100%% of the time.' % int((count0 / y_pred.shape[0])*100))""",No,5,53.0 "import lightgbm as lgb X_t, X_e, y_t, y_e = train_test_split(X_train, y_train[:,2], test_size=0.15) print(y_t.shape) print(y_e.shape) lgb_train = lgb.Dataset(X_t, y_t) lgb_eval = lgb.Dataset(X_e, y_e, reference=lgb_train) params = { 'objective': 'regression', 'metric': 'l2_root', 'num_leaves': 43, 'max_depth': 16 } gbm = lgb.train(params, lgb_train, valid_sets=lgb_eval, verbose_eval=0, early_stopping_rounds=5 ) y_pred = gbm.predict(X_test) y_pred[y_pred < 0] = 1 rmsle(y_pred, y_test[:,2])",Yes,3,49.0 "submission = pd.read_csv('../input/sampleSubmission.csv') submission['count'] = np.array(rf.predict(df_submit))[:,2] print(submission.head()) submission.to_csv('submission.csv', index=False)",No,3,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load in import numpy as np# linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) # Input data files are available in the ""../input/"" directory. 
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory import matplotlib as plt import os #print(os.listdir(""../input"")) # Any results you write to the current directory are saved as output.",No,5,22.0 "df=pd.read_csv(""../input/train.csv"") df.head() print (df.shape) testdf=pd.read_csv(""../input/test.csv"") print (testdf.shape) testdf.head()",No,4,45.0 "def missingvalues(df): miss=df.isnull().sum() misspercent=100*df.isnull().sum()/len(df) misvaltable=pd.concat([miss,misspercent],axis=1) misvaltable=misvaltable.rename(columns={0:""missing values"",1:""missing percent""}) return misvaltable df.dtypes.value_counts() ",No,4,37.0 "categoryweather=df.groupby(""holiday"").nunique() print(categoryweather) df1=pd.get_dummies(df['weather']) df1.head()'",No,3,20.0 "import matplotlib.pyplot as plt df.head()",No,4,41.0 "plt.figure(figsize=(20,20)) plt.subplot(4,2,1) plt.hist(df[""season""]) plt.xlabel(""season"") plt.ylabel(""count"") plt.subplot(4,2,2) plt.hist(df[""holiday""]) plt.xlabel(""holiday"") plt.ylabel(""count"") plt.subplot(4,2,3) plt.hist(df[""workingday""]) plt.xlabel(""workingday"") plt.ylabel(""count"") plt.subplot(4,2,4) plt.hist(df[""weather""]) plt.xlabel(""weather"") plt.ylabel(""count"") plt.subplot(4,2,5) plt.hist(df[""temp""]) plt.xlabel(""temp"") plt.ylabel(""count"") plt.subplot(4,2,6) plt.hist(df[""atemp""]) plt.xlabel(""atemp"") plt.ylabel(""count"") plt.subplot(4,2,7) plt.hist(df[""humidity""]) plt.xlabel(""humidity"") plt.ylabel(""count"") plt.subplot(4,2,8) plt.hist(df[""windspeed""]) plt.xlabel(""windspeed"") plt.ylabel(""count"") plt.show()",No,5,33.0 testdf.columns,No,5,71.0 "l=[ 'season', 'holiday', 'workingday', 'weather', 'temp', 'atemp', 'humidity', 'windspeed'] dftarget=df[[""casual"",""registered"",""count""]].copy() dftarget.head() '",No,4,21.0 "dfnew=df.copy() dfnew.drop([""registered"",""casual"",""count""],axis=1,inplace=True) dfnew.head()",No,4,10.0 "import matplotlib.pyplot as plt df2=pd.concat([dfnew,testdf]) print(df2.shape) print(df.shape)",No,4,11.0 "import matplotlib.pyplot as plt1 plt1.figure(figsize=(20,20)) plt1.subplot(4,2,1) plt1.hist(df2[""season""]) plt1.xlabel(""season"") plt1.ylabel(""count"") plt1.subplot(4,2,2) plt1.hist(df2[""holiday""]) plt1.xlabel(""holiday"") plt1.ylabel(""count"") plt1.subplot(4,2,3) plt1.hist(df2[""workingday""]) plt1.xlabel(""workingday"") plt1.ylabel(""count"") plt1.subplot(4,2,4) plt1.hist(df2[""weather""]) plt1.xlabel(""weather"") plt1.ylabel(""count"") plt1.subplot(4,2,5) plt1.hist(df2[""temp""]) plt1.xlabel(""temp"") plt1.ylabel(""count"") plt1.subplot(4,2,6) plt1.hist(df2[""atemp""]) plt1.xlabel(""atemp"") plt1.ylabel(""count"") plt1.subplot(4,2,7) plt1.hist(df2[""humidity""]) plt1.xlabel(""humidity"") plt1.ylabel(""count"") plt1.subplot(4,2,8) plt1.hist(df2[""windspeed""]) plt1.xlabel(""windspeed"") plt1.ylabel(""count"") plt1.show()",No,5,33.0 "df=pd.read_csv(""../input/train.csv"") df1=pd.get_dummies(df['weather']) df1=df1.rename(columns={1:""clear"",2:""misty"",3:""snow"",4:""heavy snow""}) df=df.drop([""weather""],axis=1) df=pd.concat([df,df1],axis=1) df.head()'",Yes,4,45.0 "df1=pd.get_dummies(df2[""weather""]) df2.drop([""weather""],axis=1,inplace=True) df2=pd.concat([df2,df1],axis=1) df2.head() ",Yes,4,10.0 "#df2=df2.drop([""weather""],axis=1) df2=df2.rename(columns={1:""clear"",2:""misty"",3:""snow"",4:""heavy snow""}) df2.head()",No,4,61.0 "df1=pd.get_dummies(df2[""season""]) #df2=df2.drop([""season""],axis=1,inplace=True) #df1.head() 
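# Editor's sketch (hedged addition): the season dummies created just above should
# be mutually exclusive, so every row ought to have exactly one indicator set.
# This read-only check is expected to print True.
print((df1.sum(axis=1) == 1).all())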
df3=pd.concat([df2,df1],axis=1) #df2.head() df3=df3.rename(columns={1:""spring"",2:""summer"",3:""fall"",4:""winter""}) df3.head()",Yes,4,11.0 df3.shape,No,5,58.0 "df.shape df0=pd.get_dummies(df[""season""]) df4=pd.concat([df,df0],axis=1) df4=df4.rename(columns={1:""spring"",2:""summer"",3:""fall"",4:""winter""}) df4.head()",Yes,4,11.0 "df4.groupby(""spring"").describe()",No,5,40.0 "df4.groupby(""fall"")[""registered""].describe()",No,5,40.0 "df4[""weekend""]=[abs(1-abs(x-y)) for x,y in zip(df4[""workingday""],df4[""holiday""]) ] df4.head()",No,4,41.0 "print(df4.groupby(""weekend"")[""datetime""].nunique()) print(df4.groupby(""holiday"")[""datetime""].nunique()) print(df4.groupby(""workingday"")[""datetime""].nunique())",No,5,54.0 "df4.groupby(""weekend"")['registered',""count"",""casual""].describe()'",No,5,40.0 "df3[""weekend""]=[abs(1-abs(x-y)) for x,y in zip(df3[""workingday""],df3[""holiday""]) ]",No,5,8.0 df3.head(),No,5,41.0 df4.head(25),No,5,41.0 "df4[""time""]=pd.to_datetime(df4[""datetime""]) df4.head()",No,4,16.0 "df4[""hours""]=df4[""time""].dt.hour df4.head()",No,5,8.0 "df3[""time""]=pd.to_datetime(df3[""datetime""]) df3.head()",No,5,16.0 "df3[""hours""]=df3[""time""].dt.hour df3.head()",No,4,8.0 "df4.drop([""time""],axis=1,inplace=True) df3.drop([""time""],axis=1,inplace=True)",No,5,10.0 df4.head(),No,5,41.0 "import seaborn as sns plt.figure(figsize=(20,20)) ax=plt.subplot(221) sns.boxplot(data=df4,x=""hours"",y=""registered"",ax=ax) ax=plt.subplot(222) sns.boxplot(data=df4,x=""hours"",y=""casual"",ax=ax) ax=plt.subplot(223) sns.boxplot(data=df4,x=""hours"",y=""count"",ax=ax)",No,5,75.0 "df4[""logcasual""]=np.log(df4[""casual""]+1) df4[""logcasual""]=np.log(df4[""casual""]+1) df4[""logcasual""]=np.log(df4[""casual""]+1) df4.head()",No,5,8.0 "df4[""logregistered""]=np.log(df4[""registered""]+1) df4[""logcount""]=np.log(df4[""count""]+1) df4.head()",No,4,8.0 "#inspecting hourly trend import seaborn as sns plt.figure(figsize=(20,20)) ax=plt.subplot(221) sns.boxplot(data=df4,x=""hours"",y=""logregistered"",ax=ax) ax=plt.subplot(222) sns.boxplot(data=df4,x=""hours"",y=""logcasual"",ax=ax) ax=plt.subplot(223) sns.boxplot(data=df4,x=""hours"",y=""logcount"",ax=ax) ",No,5,75.0 "df4[""day""]=df4[""time""].dt.day df4.head(25)",No,5,8.0 "df4[""day""]=df4[""time""].dt.dayofweek df4.head()",No,5,8.0 "df4[""day""]=df4[""time""].dt.dayofweek df4.head(25)",No,4,8.0 "#inspecting daily trend import seaborn as sns plt.figure(figsize=(20,20)) ax=plt.subplot(221) sns.boxplot(data=df4,x=""day"",y=""logregistered"",ax=ax) ax=plt.subplot(222) sns.boxplot(data=df4,x=""day"",y=""logcasual"",ax=ax) ax=plt.subplot(223) sns.boxplot(data=df4,x=""day"",y=""logcount"",ax=ax) ",No,5,75.0 "import seaborn as sns plt.figure(figsize=(20,20)) ax=plt.subplot(221) sns.boxplot(data=df4,x=""day"",y=""registered"",ax=ax) ax=plt.subplot(222) sns.boxplot(data=df4,x=""day"",y=""casual"",ax=ax) ax=plt.subplot(223) sns.boxplot(data=df4,x=""day"",y=""count"",ax=ax) ",No,5,75.0 "df5=df.copy() df5.drop([""holiday"",""workingday"",""season""],axis=1,inplace=True) df5.corr()",No,4,10.0 "df4[""year""]=df4['time'].dt.year df4.head()'",No,4,8.0 "plt.figure(figsize=(20,20)) ax=plt.subplot(2,2,1) sns.boxplot(data=df4,x=""year"",y='registered',ax=ax) #plt.figure(figsize=(20,20)) ax=plt.subplot(2,2,2) sns.boxplot(data=df4,x=""year"",y='casual',ax=ax) #plt.figure(figsize=(20,20)) ax=plt.subplot(2,2,3) sns.boxplot(data=df4,x=""year"",y='count',ax=ax) '",No,5,75.0 "plt.figure(figsize=(10,10)) 
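# Editor's sketch (hedged addition): the group medians summarised by the box plot
# drawn below, printed as a quick numeric companion to the figure.
print(df4.groupby('weekend')['casual'].median())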
sns.boxplot(data=df4,x=""weekend"",y=""casual"")",No,5,75.0 "df4.groupby(""weekend"")[""datetime""].nunique()",No,5,54.0 " df4[""month""]=df4[""time""].dt.month df4.head()",No,4,8.0 "#df4.drop([""year_bins""],axis=1,inplace=True) df4[""year_bin""]=""y0"" df4[""year_bin""].loc[(df4[""year""]==2011) & (df4[""month""]<=3)]=""y1"" df4[""year_bin""].loc[(df4[""year""]==2011) & (df4[""month""]>3) & (df4[""month""]<=6)]=""y2"" df4[""year_bin""].loc[(df4[""year""]==2011) & (df4[""month""]>6) & (df4[""month""]<=9)]=""y3"" df4[""year_bin""].loc[(df4[""year""]==2011) & (df4[""month""]>9) & (df4[""month""]<=12)]=""y4"" df4[""year_bin""].loc[(df4[""year""]==2012) & (df4[""month""]<=3)]=""y5"" df4[""year_bin""].loc[(df4[""year""]==2012) & (df4[""month""]>3) & (df4[""month""]<=6)]=""y6"" df4[""year_bin""].loc[(df4[""year""]==2012) & (df4[""month""]>6) & (df4[""month""]<=9)]=""y7"" df4[""year_bin""].loc[(df4[""year""]==2012) & (df4[""month""]>9) & (df4[""month""]<=12)]=""y8"" df4.groupby('year_bin')[""datetime""].nunique()'",No,4,20.0 "plt.figure(figsize=(20,20)) ax=plt.subplot(221) sns.boxplot(data=df4,x=""year_bin"",y=""casual"",ax=ax) ax=plt.subplot(222) sns.boxplot(data=df4,x=""year_bin"",y=""registered"",ax=ax) ax=plt.subplot(223) sns.boxplot(data=df4,x=""year_bin"",y=""count"",ax=ax)",No,5,33.0 "from sklearn.cross_validation import train_test_split from sklearn.tree import DecisionTreeClassifier from sklearn.metrics import accuracy_score from sklearn import tree",No,5,22.0 "from sklearn.tree import DecisionTreeRegressor l=[""hours""] X=df4[l] Y=df4[""casual""] dtree=DecisionTreeRegressor(max_depth=3) dtree.fit(X,Y)",No,5,7.0 "from sklearn.externals.six import StringIO from IPython.display import Image from sklearn.tree import export_graphviz import graphviz data = export_graphviz(dtree,out_file=None, filled=True, rounded=True, special_characters=True) graph = graphviz.Source(data) graph",Yes,4,22.0 "df4[""casual""].describe()",No,5,40.0 "l=[""hours""] X=df4[l] Y=df4[""registered""] dtree1=DecisionTreeRegressor(max_depth=4) dtree1.fit(X,Y)",No,5,7.0 "data = export_graphviz(dtree1,out_file=None, filled=True, rounded=True, special_characters=True) graph = graphviz.Source(data) graph",No,5,84.0 "df4[""daycas""]=""cas0"" df4[""daycas""].loc[df4[""hours""]<=6.5]=""cas1"" df4[""daycas""].loc[(df4[""hours""]>6.5) & (df4[""hours""]<=7.5)]=""cas2"" df4[""daycas""].loc[(df4[""hours""]>7.5) & (df4[""hours""]<=8.5)]=""cas3"" df4[""daycas""].loc[(df4[""hours""]>8.5) & (df4[""hours""]<=9.5)]=""cas4"" df4[""daycas""].loc[(df4[""hours""]>9.5) & (df4[""hours""]<=10.5)]=""cas5"" df4[""daycas""].loc[(df4[""hours""]>10.5) & (df4[""hours""]<=19.5)]=""cas6"" df4[""daycas""].loc[(df4[""hours""]>19.5) & (df4[""hours""]<=21.5)]=""cas7"" df4[""daycas""].loc[df4[""hours""]>21.5]=""cas8""",No,5,20.0 "df4.groupby(""daycas"")[""datetime""].nunique()",No,5,54.0 "df4[""dayreg""]=""reg0"" df4[""dayreg""].loc[df4[""hours""]<=0.5]=""reg1"" df4[""dayreg""].loc[(df4[""hours""]>0.5) & (df4[""hours""]<=1.5)]=""reg2"" df4[""dayreg""].loc[(df4[""hours""]>1.5) & (df4[""hours""]<=4.5)]=""reg3"" df4[""dayreg""].loc[(df4[""hours""]>4.5) & (df4[""hours""]<=5.5)]=""reg4"" df4[""dayreg""].loc[(df4[""hours""]>5.5) & (df4[""hours""]<=6.5)]=""reg5"" df4[""dayreg""].loc[(df4[""hours""]>6.5) & (df4[""hours""]<=8.5)]=""reg6"" df4[""dayreg""].loc[(df4[""hours""]>8.5) & (df4[""hours""]<=16.5)]=""reg7"" df4[""dayreg""].loc[(df4[""hours""]>16.5) & (df4[""hours""]<=18.5)]=""reg8"" df4[""dayreg""].loc[(df4[""hours""]>18.5) & (df4[""hours""]<=20.5)]=""reg9"" 
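# Editor's sketch (hedged): the reg1..reg12 hour buckets being assigned around this
# point can also be produced with a single pd.cut call over the same thresholds.
# _dayreg_sketch is a throwaway name used only for illustration and is not reused.
_dayreg_sketch = pd.cut(
    df4['hours'],
    bins=[-1, 0.5, 1.5, 4.5, 5.5, 6.5, 8.5, 16.5, 18.5, 20.5, 21.5, 22.5, 24],
    labels=['reg1', 'reg2', 'reg3', 'reg4', 'reg5', 'reg6',
            'reg7', 'reg8', 'reg9', 'reg10', 'reg11', 'reg12'])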
df4[""dayreg""].loc[(df4[""hours""]>20.5) & (df4[""hours""]<=21.5)]=""reg10"" df4[""dayreg""].loc[(df4[""hours""]>21.5) & (df4[""hours""]<=22.5)]=""reg11"" df4[""dayreg""].loc[df4[""hours""]>22.5]=""reg12"" df4.groupby(""dayreg"")[""datetime""].nunique()",No,4,20.0 "df4.head() ",No,5,41.0 "df3[""time""]=pd.to_datetime(df3[""datetime""]) df3[""year""]=df3['time'].dt.year df3[""month""]=df3[""time""].dt.month df3[""day""]=df3[""time""].dt.dayofweek df3.head()'",No,4,8.0 "df3[""year_bin""]=""y0"" df3[""year_bin""].loc[(df3[""year""]==2011) & (df3[""month""]<=3)]=""y1"" df3[""year_bin""].loc[(df3[""year""]==2011) & (df3[""month""]>3) & (df3[""month""]<=6)]=""y2"" df3[""year_bin""].loc[(df3[""year""]==2011) & (df3[""month""]>6) & (df3[""month""]<=9)]=""y3"" df3[""year_bin""].loc[(df3[""year""]==2011) & (df3[""month""]>9) & (df3[""month""]<=12)]=""y4"" df3[""year_bin""].loc[(df3[""year""]==2012) & (df3[""month""]<=3)]=""y5"" df3[""year_bin""].loc[(df3[""year""]==2012) & (df3[""month""]>3) & (df3[""month""]<=6)]=""y6"" df3[""year_bin""].loc[(df3[""year""]==2012) & (df3[""month""]>6) & (df3[""month""]<=9)]=""y7"" df3[""year_bin""].loc[(df3[""year""]==2012) & (df3[""month""]>9) & (df3[""month""]<=12)]=""y8"" df3[""daycas""]=""cas0"" df3[""daycas""].loc[df3[""hours""]<=6.5]=""cas1"" df3[""daycas""].loc[(df3[""hours""]>6.5) & (df3[""hours""]<=7.5)]=""cas2"" df3[""daycas""].loc[(df3[""hours""]>7.5) & (df3[""hours""]<=8.5)]=""cas3"" df3[""daycas""].loc[(df3[""hours""]>8.5) & (df3[""hours""]<=9.5)]=""cas4"" df3[""daycas""].loc[(df3[""hours""]>9.5) & (df3[""hours""]<=10.5)]=""cas5"" df3[""daycas""].loc[(df3[""hours""]>10.5) & (df3[""hours""]<=19.5)]=""cas6"" df3[""daycas""].loc[(df3[""hours""]>19.5) & (df3[""hours""]<=21.5)]=""cas7"" df3[""daycas""].loc[df3[""hours""]>21.5]=""cas8"" df3[""dayreg""]=""reg0"" df3[""dayreg""].loc[df3[""hours""]<=0.5]=""reg1"" df3[""dayreg""].loc[(df3[""hours""]>0.5) & (df3[""hours""]<=1.5)]=""reg2"" df3[""dayreg""].loc[(df3[""hours""]>1.5) & (df3[""hours""]<=4.5)]=""reg3"" df3[""dayreg""].loc[(df3[""hours""]>4.5) & (df3[""hours""]<=5.5)]=""reg4"" df3[""dayreg""].loc[(df3[""hours""]>5.5) & (df3[""hours""]<=6.5)]=""reg5"" df3[""dayreg""].loc[(df3[""hours""]>6.5) & (df3[""hours""]<=8.5)]=""reg6"" df3[""dayreg""].loc[(df3[""hours""]>8.5) & (df3[""hours""]<=16.5)]=""reg7"" df3[""dayreg""].loc[(df3[""hours""]>16.5) & (df3[""hours""]<=18.5)]=""reg8"" df3[""dayreg""].loc[(df3[""hours""]>18.5) & (df3[""hours""]<=20.5)]=""reg9"" df3[""dayreg""].loc[(df3[""hours""]>20.5) & (df3[""hours""]<=21.5)]=""reg10"" df3[""dayreg""].loc[(df3[""hours""]>21.5) & (df3[""hours""]<=22.5)]=""reg11"" df3[""dayreg""].loc[df3[""hours""]>22.5]=""reg12"" df3.head()",No,5,20.0 "df6=df4.copy() df6.drop([""datetime"",""season"",""time"",""count"",""registered"",""casual"",""logcount""],axis=1,inplace=True) df6.head()",No,4,10.0 "df4[""rtemp1""]=df4[""temp""]+df4[""atemp""] df4[""rtemp2""]=df4[""temp""]-df4[""atemp""] df4[""rtemp3""]=df4[""temp""]*df4[""atemp""] print(df4[""temp""].corr(df4[""registered""])) print(df4[""rtemp1""].corr(df4[""registered""])) print(df4[""rtemp2""].corr(df4[""registered""])) print(df4[""rtemp3""].corr(df4[""registered""])) df3[""rtemp3""]=df3[""temp""]*df3[""atemp""]",No,5,8.0 "df3.drop([""datetime"",""season"",""time""],axis=1,inplace=True)",No,5,10.0 "from sklearn.cross_validation import train_test_split from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor from sklearn.metrics import mean_squared_error",No,5,22.0 "df6.head() ",No,5,41.0 "df7=df6.logcasual 
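# Editor's sketch (hedged addition): df7 holds log1p-transformed casual counts, so
# expm1 should recover the original values; a read-only consistency check that is
# expected to print True.
import numpy as np
print(np.allclose(np.expm1(df7), df4['casual']))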
df8=df6.logregistered",No,5,8.0 "df6.drop([""logcasual"",'logregistered'],axis=1,inplace=True) '",No,5,10.0 "df6.drop([""atemp""],axis=1,inplace=True)",No,5,10.0 "df6.drop([""temp""],axis=1,inplace=True)",No,5,10.0 "df6.drop([""rtemp3"",""month""],axis=1,inplace=True) d=pd.read_csv(""../input/train.csv"") df6[""temp""]=d[""temp""] var=[""holiday"",""workingday"",""weekend"",""hours""] for v in var: df6[v]=df6[v].astype(""category"") ",No,3,10.0 "from sklearn.model_selection import train_test_split x_train,x_test,y_train,y_test=train_test_split(dfreg,df7,random_state=42) rf=RandomForestRegressor(n_estimators=500) rf.fit(x_train,y_train) predictions=rf.predict(x_test) mean_squared_error(y_test, predictions)",Yes,4,20.0 "rf1=RandomForestRegressor(n_estimators=500) rf1.fit(dfcas,df7) rf2=RandomForestRegressor(n_estimators=500) rf2.fit(dfreg,df8)",Yes,4,49.0 df6.head(),No,5,41.0 "df3.drop([""temp"",""atemp""],axis=1,inplace=True)",No,5,10.0 "print(df3.shape) print(df6.shape)",No,5,58.0 "newtest=df3.tail(17379-10886) newtest.head()",No,5,41.0 testdf.head(),No,5,41.0 newtest.head(),No,5,41.0 "print(newtest.shape) print(testdf.shape)",No,5,58.0 "newtest.drop([""rtemp3"",""month""],axis=1,inplace=True) f=pd.read_csv(""../input/test.csv"") newtest['year']=newtest.year.replace({2011:0,2012:1}) newtest[""temp""]=f[""temp""] var=[""holiday"",""workingday"",""weekend"",""hours""] for v in var: newtest[v]=newtest[v].astype(""category"") newtestreg=newtest.copy() newtestcas=newtest.copy() newtestreg.drop([""daycas""],axis=1,inplace=True) newreg1=pd.get_dummies(newtestreg[""dayreg""]) newtestreg=pd.concat([newtestreg,newreg1],axis=1) newreg2=pd.get_dummies(newtestreg[""year_bin""]) newtestreg=pd.concat([newtestreg,newreg2],axis=1) newreg3=pd.get_dummies(newtestreg[""day""]) newtestreg=pd.concat([newtestreg,newreg3],axis=1) newtestreg.drop([""dayreg"",""year_bin"",""day""],axis=1,inplace=True) newtestcas.drop([""dayreg""],axis=1,inplace=True) newcas1=pd.get_dummies(newtestcas[""daycas""]) newtestcas=pd.concat([newtestcas,newcas1],axis=1) newcas2=pd.get_dummies(newtestcas[""year_bin""]) newtestcas=pd.concat([newtestcas,newcas2],axis=1) newcas3=pd.get_dummies(newtestcas[""day""]) newtestcas=pd.concat([newtestcas,newcas3],axis=1) newtestcas.drop([""daycas"",""year_bin"",""day""],axis=1,inplace=True) newtest.head() '",No,2,58.0 "predictcas=rf1.predict(newtestcas) predictcas=np.exp(predictcas)-1 predictreg=rf2.predict(newtestreg) predictreg=np.exp(predictreg)-1 ",Yes,4,20.0 print(type(predictcas)),No,5,70.0 "dfcas.head() ",No,5,41.0 import os ,No,5,22.0 "df = pd.read_csv('../input/train.csv', parse_dates=[0])",No,5,45.0 "test = pd.read_csv('../input/test.csv', parse_dates=[0])",No,5,45.0 df_all['hour'] = df['datetime'].dt.hour,No,5,8.0 import numpy as np,No,5,22.0 "df_all['count'] = np.log(df_all['count'] + 1) df_all['registered'] = np.log(df_all['registered'] + 1) df_all['casual'] = np.log(df_all['casual'] + 1)",No,5,8.0 "df_all.shape, df.shape, test.shape",No,5,58.0 df_all.shape,No,5,58.0 "from fastai.imports import * from fastai.structured import *",No,5,22.0 df_all.info(),No,5,40.0 "df = df_all[~df_all['count'].isnull()] test = df_all[df_all['count'].isnull()]",No,5,13.0 "df.shape, test.shape",No,5,58.0 train = df[df['datetimeDay'] <= 15],No,5,14.0 valid = df[df['datetimeDay'] > 15],No,5,14.0 "train.shape, valid.shape",No,5,58.0 "feats = [c for c in df.columns if c not in ['casual', 'registered', 'count']]",No,5,77.0 "feats = ['season', 'holiday', 'workingday', 'weather', 'temp', 'atemp', 'humidity', 
'windspeed', 'datetimeDayofweek', 'hour', 'datetimeYear']",No,5,77.0 "rf.fit(train[feats], train['count'])",No,5,7.0 rf.predict(valid[feats]),No,5,48.0 from sklearn.metrics import mean_squared_error,No,5,22.0 "mean_squared_error(valid['count'], rf.predict(valid[feats])) ** (1/2)",No,5,49.0 "pd.Series(rf.feature_importances_, index=feats).sort_values().plot.barh()",No,5,79.0 real_test_predictions.min(),No,2,40.0 "submission = pd.DataFrame({ ""datetime"": real_test_data_ids, ""count"": [max(0, x) for x in real_test_predictions] }) submission.head()",Yes,3,41.0 "import numpy as np import pandas as pd import matplotlib.pyplot as plt import os RANDOM_STATE = 31415",No,5,77.0 "cols = df_train.columns.tolist() cols",No,5,71.0 "from sklearn.preprocessing import StandardScaler scaler = StandardScaler() scaler.fit_transform(df_train)",Yes,4,7.0 "x = df_train.drop(['Cover_Type'],axis = 1) y = df_train['Cover_Type']",No,5,21.0 "x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.30,random_state=42)",No,5,13.0 "xgb = XGBClassifier() xgb.fit(x_train,y_train)",No,5,7.0 "predics =xgb.predict(x_test) predics ",No,5,48.0 "accuracy_score(y_test,predics)",No,5,49.0 "df_test['Cover_Type'] = xgb.predict(df_test) df_test['Cover_Type']",No,5,48.0 "my_submission = pd.DataFrame({'Id':test_id,'Cover_Type': df_test['Cover_Type']}) my_submission.to_csv('submission.csv', index=False)",No,5,25.0 my_submission.to_csv(r'my_submission.csv'),No,5,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load in import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) from pandas.tseries.frequencies import to_offset #Set the frequency in the index # Input data files are available in the ""../input/"" directory. # For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory import os #print(os.listdir(""../input"")) # Any results you write to the current directory are saved as output.",No,5,22.0 "#Import the files and set the index X_train = pd.read_csv('../input/train.csv') X_test = pd.read_csv('../input/test.csv') X_train = return_set_index(X_train) X_test = return_set_index(X_test) ",No,5,45.0 "#We will devide the train dataset in seasons and working days vs non working days. The idea is to group the results by hour and observe if the ratio #mean standard deviation for each group is better or not for the overall group. 
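# Editor's sketch (hedged): the twelve explicit masks built below can also be
# expressed as one groupby over the same two columns. _by_season_workingday is a
# hypothetical helper name used only for illustration; the explicit subsets that
# follow are what this notebook actually uses.
_by_season_workingday = dict(tuple(X_train.groupby(['season', 'workingday'])))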
X_workingday_1 = X_train[(X_train.workingday == 1) & (X_train.season == 1)] X_holiday_1 = X_train[(X_train.workingday == 0) & (X_train.season == 1)] X_total_1 = X_train[(X_train.season == 1)] X_workingday_2 = X_train[(X_train.workingday == 1) & (X_train.season == 2)] X_holiday_2 = X_train[(X_train.workingday == 0) & (X_train.season == 2)] X_total_2 = X_train[(X_train.season == 2)] X_workingday_3 = X_train[(X_train.workingday == 1) & (X_train.season == 3)] X_holiday_3 = X_train[(X_train.workingday == 0) & (X_train.season == 3)] X_total_3 = X_train[(X_train.season == 3)] X_workingday_4 = X_train[(X_train.workingday == 1) & (X_train.season == 4)] X_holiday_4 = X_train[(X_train.workingday == 0) & (X_train.season == 4)] X_total_4 = X_train[(X_train.season == 4)]",No,5,14.0 "X_final.reset_index(level=0, inplace=True) X_final.to_csv('result.csv', index=False)",No,5,25.0 "import calendar import os import matplotlib.pyplot as plt import numpy as np import pandas as pd import seaborn as sns from datetime import datetime from scipy import stats %matplotlib inline sns.set()",No,5,23.0 "drop_lst = ['casual', 'registered'] df = df.drop(drop_lst, axis=1) df.head()",No,4,10.0 df['count'].head(),No,5,41.0 plt.hist(df['count']);,No,5,33.0 "count_log = np.log(df['count']) plt.hist(count_log);",No,5,33.0 plt.hist(count_boxcox);,No,5,33.0 "df['count_log'] = count_log df['count_boxcox'] = count_boxcox",No,5,8.0 df['datetime'] = pd.to_datetime(df['datetime']),No,5,16.0 "df['dow'] = df['datetime'].dt.dayofweek df.head()",No,5,8.0 "df['month'] = df['datetime'].dt.month df.head()",No,4,8.0 "df['week'] = df['datetime'].dt.week df.head()",No,5,8.0 "df['hour'] = df['datetime'].dt.hour df.head()",No,4,8.0 "df['year'] = df['datetime'].dt.year df.head()",No,4,8.0 "df['day'] = df['datetime'].dt.day df.head()",No,4,8.0 "df = df.drop(labels='datetime', axis=1) df.head()",No,4,10.0 "fig, ax = plt.subplots(1, 2, figsize=(12, 4)) names = ['1', '2', '3', '4'] values = df['season'][df['year'] == 2011].value_counts() ax[0].bar(names, values) values = df['season'][df['year'] == 2012].value_counts() ax[1].bar(names, values) fig.suptitle('Seasons in 2011 & 2012');",No,5,33.0 "spring_2011 = int(df['season'][df['season'] == 1][df['year'] == 2011].value_counts()) summer_2011 = int(df['season'][df['season'] == 2][df['year'] == 2011].value_counts()) fall_2011 = int(df['season'][df['season'] == 3][df['year'] == 2011].value_counts()) winter_2011 = int(df['season'][df['season'] == 4][df['year'] == 2011].value_counts()) spring_2012 = int(df['season'][df['season'] == 1][df['year'] == 2012].value_counts()) summer_2012 = int(df['season'][df['season'] == 2][df['year'] == 2012].value_counts()) fall_2012 = int(df['season'][df['season'] == 3][df['year'] == 2012].value_counts()) winter_2012 =int(df['season'][df['season'] == 4][df['year'] == 2012].value_counts()) print(""Spring 2011: {}"".format(spring_2011)) print(""Summer 2011: {}"".format(summer_2011)) print(""Fall 2011: {}"".format(fall_2011)) print(""Winter 2011: {}"".format(winter_2011)) print(""-----------------------------------------"") print(""Spring 2012: {}"".format(spring_2012)) print(""Summer 2012: {}"".format(summer_2012)) print(""Fall 2012: {}"".format(fall_2012)) print(""Winter 2012: {}"".format(winter_2012))'",No,5,72.0 "fig, ax = plt.subplots(1, 2, figsize=(12, 4)) names = ['0', '1'] values = df['holiday'][df['year'] == 2011].value_counts() ax[0].bar(names, values) values = df['holiday'][df['year'] == 2012].value_counts() ax[1].bar(names, values) fig.suptitle('Holidays in 2011 & 
2012');",No,5,33.0 "# metric to optimize from sklearn.metrics import mean_squared_error from sklearn.metrics import make_scorer scorer = make_scorer(lambda y_test, predictions: np.sqrt(mean_squared_error(y_test, predictions)))",Yes,5,84.0 "no_holiday_2011 = int(df['holiday'][df['holiday'] == 0][df['year'] == 2011].value_counts()) holiday_2011 = int(df['holiday'][df['holiday'] == 1][df['year'] == 2011].value_counts()) no_holiday_2012 = int(df['holiday'][df['holiday'] == 0][df['year'] == 2012].value_counts()) holiday_2012 = int(df['holiday'][df['holiday'] == 1][df['year'] == 2012].value_counts()) print(""No Holidays 2011: {}"".format(no_holiday_2011)) print(""No Holidays 2012: {}"".format(no_holiday_2012)) print(""Holidays 2011: {}"".format(holiday_2011)) print(""Holidays 2012: {}"".format(holiday_2012)) print('----------------') total_2011 = no_holiday_2011 + holiday_2011 total_2012 = no_holiday_2012 + holiday_2012 print('No Holidays 2011: {:.0f}%'.format(no_holiday_2011 / total_2011 * 100)) print('No Holidays 2012: {:.0f}%'.format(no_holiday_2012 / total_2012 * 100))'",No,5,72.0 training_set = pd.read_csv('../input/train.csv'),No,5,45.0 training_set.head(),No,5,41.0 "fig, ax = plt.subplots(1, 2, figsize=(12, 4)) names = ['0', '1'] values = df['workingday'][df['year'] == 2011].value_counts() ax[0].bar(names, values) values = df['workingday'][df['year'] == 2012].value_counts() ax[1].bar(names, values) fig.suptitle('Working day in 2011 & 2012');",No,4,33.0 "no_workingday_2011 = int(df['workingday'][df['workingday'] == 0][df['year'] == 2011].value_counts()) workingday_2011 = int(df['workingday'][df['workingday'] == 1][df['year'] == 2011].value_counts()) no_workingday_2012 = int(df['workingday'][df['workingday'] == 0][df['year'] == 2012].value_counts()) workingday_2012 = int(df['workingday'][df['workingday'] == 1][df['year'] == 2012].value_counts()) print(""No working day 2011: {}"".format(no_workingday_2011)) print(""working day 2011: {}"".format(workingday_2011)) print(""No working day 2012: {}"".format(no_workingday_2012)) print(""working day 2012: {}"".format(workingday_2012)) print('----------------') total_2011 = no_workingday_2011 + workingday_2011 total_2012 = no_workingday_2012 + workingday_2012 print('No working day 2011: {:.0f}%'.format(no_workingday_2011 / total_2011 * 100)) print('No working day 2012: {:.0f}%'.format(no_workingday_2012 / total_2012 * 100))'",No,5,72.0 "fig, ax = plt.subplots(1, 2, figsize=(12, 4)) names_2011 = ['1', '2', '3'] names_2012 = ['1', '2', '3', '4'] values = df['weather'][df['year'] == 2011].value_counts() ax[0].bar(names_2011, values) values = df['weather'][df['year'] == 2012].value_counts() ax[1].bar(names_2012, values) fig.suptitle('Weather in 2011 & 2012');",No,5,33.0 "weather_2011_1 = df['weather'][df['weather'] == 1][df['year'] == 2011].value_counts() weather_2011_2 = df['weather'][df['weather'] == 2][df['year'] == 2011].value_counts() weather_2011_3 = df['weather'][df['weather'] == 3][df['year'] == 2011].value_counts() weather_2012_1 = df['weather'][df['weather'] == 1][df['year'] == 2012].value_counts() weather_2012_2 = df['weather'][df['weather'] == 2][df['year'] == 2012].value_counts() weather_2012_3 = df['weather'][df['weather'] == 3][df['year'] == 2012].value_counts() weather_2012_4 = df['weather'][df['weather'] == 4][df['year'] == 2012].value_counts() print('weather_1 in 2011: {}'.format(int(weather_2011_1))) print('weather_2 in 2011: {}'.format(int(weather_2011_2))) print('weather_3 in 2011: {}'.format(int(weather_2011_3))) 
print('--------------') print('weather_1 in 2012: {}'.format(int(weather_2012_1))) print('weather_2 in 2012: {}'.format(int(weather_2012_2))) print('weather_3 in 2012: {}'.format(int(weather_2012_3))) print('weather_4 in 2012: {}'.format(int(weather_2012_4))) print('---------------') total_2011 = int(weather_2011_1) + int(weather_2011_2) + int(weather_2011_3) total_2012 = int(weather_2012_1) + int(weather_2012_2) + int(weather_2012_3) + int(weather_2012_4) print('weather_1 in 2011: {:.0f}%'.format(int(weather_2011_1) / int(total_2011) * 100)) print('weather_2 in 2011: {:.0f}%'.format(int(weather_2011_2) / int(total_2011) * 100)) print('weather_3 in 2011: {:.0f}%'.format(int(weather_2011_3) / int(total_2011) * 100)) print('--------------') print('weather_1 in 2012: {:.0f}%'.format(int(weather_2012_1) / int(total_2012) * 100)) print('weather_2 in 2012: {:.0f}%'.format(int(weather_2012_2) / int(total_2012) * 100)) print('weather_3 in 2012: {:.0f}%'.format(int(weather_2012_3) / int(total_2012) * 100)) print('weather_4 in 2012: {:.0f}%'.format(int(weather_2012_4) / int(total_2012) * 100))",No,5,72.0 "plt.hist(df['temp'][df['year'] == 2011], alpha=0.5, label='2011') plt.hist(df['temp'][df['year'] == 2012], alpha=0.5, label='2012') plt.legend(loc='upper right');",No,5,33.0 "training_set.plot(x = 'datetime', y = 'casual')",No,5,75.0 "training_set.plot(x = 'datetime', y = 'registered')",No,5,75.0 "plt.hist(df['atemp'][df['year'] == 2011], alpha=0.5, label='2011') plt.hist(df['atemp'][df['year'] == 2012], alpha=0.5, label='2012') plt.legend(loc='upper right');",No,5,33.0 "plt.hist(df['humidity'][df['year'] == 2011], alpha=0.5, label='2011') plt.hist(df['humidity'][df['year'] == 2012], alpha=0.5, label='2012') plt.legend(loc='upper right');",No,5,33.0 "plt.hist(df['windspeed'][df['year'] == 2011], alpha=0.5, label='2011') plt.hist(df['windspeed'][df['year'] == 2012], alpha=0.5, label='2012') plt.legend(loc='upper right');",No,5,33.0 "plt.hist(df['dow'][df['year'] == 2011], alpha=0.5, label='2011', bins=7) plt.hist(df['dow'][df['year'] == 2012], alpha=0.5, label='2012', bins=7) plt.legend(loc='upper right');",No,5,33.0 "plt.hist(df['month'][df['year'] == 2011], alpha=0.5, label='2011', bins=12) plt.hist(df['month'][df['year'] == 2012], alpha=0.5, label='2012', bins=12) plt.legend(loc='upper right');",No,5,75.0 "plt.hist(df['week'][df['year'] == 2011], alpha=0.5, label='2011', bins=52) plt.hist(df['week'][df['year'] == 2012], alpha=0.5, label='2012', bins=52) fig = plt.gcf() fig.set_size_inches(18.5, 10.5) plt.legend(loc='upper right');",No,5,33.0 "plt.hist(df['hour'][df['year'] == 2011], alpha=0.5, label='2011', bins=24) plt.hist(df['hour'][df['year'] == 2012], alpha=0.5, label='2012', bins=24) plt.legend(loc='upper right');",No,5,33.0 "plt.hist(df['day'][df['year'] == 2011], alpha=0.5, label='2011', bins=31) plt.hist(df['day'][df['year'] == 2012], alpha=0.5, label='2012', bins=31) fig = plt.gcf() fig.set_size_inches(18.5, 10.5) plt.legend(loc='upper right');",No,5,75.0 "names = ['2011', '2012'] values = df['year'].value_counts() plt.bar(names, values);",No,5,33.0 "count_2011 = df['year'][df['year'] == 2011].count() count_2012 = df['year'][df['year'] == 2012].count() print('2011: {}'.format(count_2011)) print('2012: {}'.format(count_2012))",No,5,72.0 "cor_mat = df[:].corr() mask = np.array(cor_mat) mask[np.tril_indices_from(mask)] = False fig = plt.gcf() fig.set_size_inches(30,12) sns.heatmap(data=cor_mat, mask=mask, square=True, annot=True, cbar=True);",No,5,80.0 "corr = training_set.corr() fig, ax = 
plt.subplots(figsize=(30, 30)) ax.matshow(corr) for (i, j), z in np.ndenumerate(corr): ax.text(j, i, '{:0.1f}'.format(z), ha='center', va='center', bbox=dict(boxstyle='round', facecolor='white', edgecolor='0.3')) plt.xticks(range(len(corr.columns)), corr.columns); plt.yticks(range(len(corr.columns)), corr.columns);",No,5,80.0 "from sklearn.model_selection import train_test_split # Basic preprocessing which applies to all regression techniques (dependent variable: casual) data = training_set.drop(columns = ['datetime', 'atemp', 'registered', 'count']) X_train, X_test, y_train, y_test = train_test_split(data, data.casual, test_size=0.2, random_state = RANDOM_STATE) X_train = X_train.drop(columns = ['casual']) X_test = X_test.drop(columns = ['casual'])",Yes,3,13.0 "# Preprocessing for linear regression from sklearn.preprocessing import OneHotEncoder from sklearn.preprocessing import MinMaxScaler scaler = MinMaxScaler() X_train_norm = scaler.fit_transform(X_train) X_test_norm = scaler.transform(X_test) one_hot = OneHotEncoder(categorical_features = [0, 1, 2, 3]) #season, holiday, workingday and weather X_train_norm = one_hot.fit_transform(X_train_norm) X_test_norm = one_hot.transform(X_test_norm)",Yes,4,20.0 from sklearn.linear_model import Lasso,No,5,22.0 "from sklearn.model_selection import cross_val_score casual_model = Lasso() scores = cross_val_score(casual_model, X_train_norm, y_train, cv=5, scoring = scorer) scores",Yes,5,84.0 "casual_model.fit(X_train_norm, y_train)",No,5,7.0 "# Same thing for the second variable # Basic preprocessing which applies to all regression techniques (dependent variable: casual) data = training_set.drop(columns = ['datetime', 'atemp', 'casual', 'count']) X_train, X_test, y_train, y_test = train_test_split(data, data.registered, test_size=0.2, random_state = RANDOM_STATE) X_train = X_train.drop(columns = ['registered']) X_test = X_test.drop(columns = ['registered'])",Yes,4,13.0 "# Preprocessing for linear regression from sklearn.preprocessing import OneHotEncoder from sklearn.model_selection import train_test_split from sklearn.preprocessing import MinMaxScaler scaler = MinMaxScaler() X_train_norm = scaler.fit_transform(X_train) X_test_norm = scaler.transform(X_test) one_hot = OneHotEncoder(categorical_features = [0, 1, 2, 3]) #season, holiday, workingday and weather X_train_norm = one_hot.fit_transform(X_train_norm) X_test_norm = one_hot.transform(X_test_norm)",Yes,4,20.0 "from sklearn.linear_model import Lasso from sklearn.model_selection import cross_val_score registered_model = Lasso() scores = cross_val_score(registered_model, X_train_norm, y_train, cv=5, scoring = scorer) scores",Yes,5,28.0 "registered_model.fit(X_train_norm, y_train)",No,5,7.0 "# Final prediction of the baseline models, as I am not going to tweak them, I will move directly to the test data test_dataset = pd.read_csv(""../input/test.csv"")",No,5,45.0 "test_data = test_dataset.drop(columns = ['datetime', 'atemp']) test_data = scaler.transform(test_data) test_data = one_hot.transform(test_data)",Yes,5,20.0 "casual = casual_model.predict(test_data) registered = registered_model.predict(test_data) total = casual + registered",No,4,48.0 test_dataset[test_dataset['count'] < 0],No,5,14.0 "test_dataset.loc[test_dataset['count'] < 0, 'count'] = 0",No,2,8.0 test_dataset[test_dataset['count'] <= 0],No,5,14.0 "test_dataset[['datetime', 'count']].to_csv('result.csv', index = False)",No,4,25.0 "# Ignore the warnings import warnings warnings.filterwarnings('always') warnings.filterwarnings('ignore') # 
data visualisation and manipulation import numpy as np import pandas as pd import matplotlib.pyplot as plt from matplotlib import style import seaborn as sns import missingno as msno #configure # sets matplotlib to inline and displays graphs below the corressponding cell. % matplotlib inline style.use('fivethirtyeight') sns.set(style='whitegrid',color_codes=True) #import the necessary modelling algos. #classifiaction. from sklearn.linear_model import LogisticRegression from sklearn.svm import LinearSVC,SVC from sklearn.neighbors import KNeighborsClassifier from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier from sklearn.tree import DecisionTreeClassifier from sklearn.naive_bayes import GaussianNB #regression from sklearn.linear_model import LinearRegression,Ridge,Lasso,RidgeCV from sklearn.ensemble import RandomForestRegressor,BaggingRegressor,GradientBoostingRegressor,AdaBoostRegressor from sklearn.svm import SVR from sklearn.neighbors import KNeighborsRegressor #model selection from sklearn.model_selection import train_test_split,cross_validate from sklearn.model_selection import KFold from sklearn.model_selection import GridSearchCV #evaluation metrics from sklearn.metrics import mean_squared_log_error,mean_squared_error, r2_score,mean_absolute_error # for regression from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score # for classification",No,5,23.0 "train=pd.read_csv(r'../input/train.csv') test=pd.read_csv(r'../input/test.csv') df=train.copy() test_df=test.copy() df.head()",Yes,5,45.0 df.columns.unique(),No,5,57.0 df.isnull().sum(),No,5,39.0 msno.matrix(df),No,5,34.0 df.season.value_counts(),No,5,72.0 "sns.factorplot(x='season',data=df,kind='count',size=5,aspect=1.5)",No,5,33.0 "df.holiday.value_counts() sns.factorplot(x='holiday',data=df,kind='count',size=5,aspect=1)",Yes,5,33.0 "df.workingday.value_counts() sns.factorplot(x='workingday',data=df,kind='count',size=5,aspect=1)",Yes,5,33.0 "# 1-> spring # 2-> summer # 3-> fall # 4-> winter sns.factorplot(x='weather',data=df,kind='count',size=5,aspect=1)",No,5,33.0 "sns.boxplot(data=df[['temp', 'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count']]) fig=plt.gcf() fig.set_size_inches(10,10)",No,5,33.0 "df.temp.unique() fig,axes=plt.subplots(2,2) axes[0,0].hist(x=""temp"",data=df,edgecolor=""black"",linewidth=2,color='#ff4125') axes[0,0].set_title(""Variation of temp"") axes[0,1].hist(x=""atemp"",data=df,edgecolor=""black"",linewidth=2,color='#ff4125') axes[0,1].set_title(""Variation of atemp"") axes[1,0].hist(x=""windspeed"",data=df,edgecolor=""black"",linewidth=2,color='#ff4125') axes[1,0].set_title(""Variation of windspeed"") axes[1,1].hist(x=""humidity"",data=df,edgecolor=""black"",linewidth=2,color='#ff4125') axes[1,1].set_title(""Variation of humidity"") fig.set_size_inches(10,10)'",No,5,33.0 "cor_mat= df[:].corr() mask = np.array(cor_mat) mask[np.tril_indices_from(mask)] = False fig=plt.gcf() fig.set_size_inches(30,12) sns.heatmap(data=cor_mat,mask=mask,square=True,annot=True,cbar=True)",No,5,80.0 "season=pd.get_dummies(df['season'],prefix='season') df=pd.concat([df,season],axis=1) df.head() season=pd.get_dummies(test_df['season'],prefix='season') test_df=pd.concat([test_df,season],axis=1) test_df.head()",Yes,4,20.0 "weather=pd.get_dummies(df['weather'],prefix='weather') df=pd.concat([df,weather],axis=1) df.head() weather=pd.get_dummies(test_df['weather'],prefix='weather') test_df=pd.concat([test_df,weather],axis=1) test_df.head()",Yes,4,20.0 
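The two cells above one-hot encode season and weather with pd.get_dummies plus pd.concat, applied separately to df and test_df, and the next cell drops the original columns. Below is a minimal self-contained sketch of that same pattern; the helper name add_dummies and the toy frames are illustrative only (not part of the notebooks), and the final align call is just one possible way to keep train/test columns consistent.

import pandas as pd

def add_dummies(frame, column):
    # one-hot encode a single column and drop the original, mirroring the
    # get_dummies + concat + drop pattern used in the surrounding cells
    dummies = pd.get_dummies(frame[column], prefix=column)
    return pd.concat([frame.drop(column, axis=1), dummies], axis=1)

train_toy = pd.DataFrame({'season': [1, 2, 3], 'weather': [1, 1, 2]})
test_toy = pd.DataFrame({'season': [2, 4], 'weather': [3, 1]})
for col in ['season', 'weather']:
    train_toy = add_dummies(train_toy, col)
    test_toy = add_dummies(test_toy, col)

# encoding train and test separately can leave them with different columns
# (e.g. season_4 appears only in test_toy here), so align them afterwards
train_toy, test_toy = train_toy.align(test_toy, join='outer', axis=1, fill_value=0)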
"df.drop(['season','weather'],inplace=True,axis=1) df.head() test_df.drop(['season','weather'],inplace=True,axis=1) test_df.head()",Yes,4,10.0 "df[""hour""] = [t.hour for t in pd.DatetimeIndex(df.datetime)] df[""day""] = [t.dayofweek for t in pd.DatetimeIndex(df.datetime)] df[""month""] = [t.month for t in pd.DatetimeIndex(df.datetime)] df['year'] = [t.year for t in pd.DatetimeIndex(df.datetime)] df['year'] = df['year'].map({2011:0, 2012:1}) df.head()'",Yes,4,8.0 "test_df[""hour""] = [t.hour for t in pd.DatetimeIndex(test_df.datetime)] test_df[""day""] = [t.dayofweek for t in pd.DatetimeIndex(test_df.datetime)] test_df[""month""] = [t.month for t in pd.DatetimeIndex(test_df.datetime)] test_df['year'] = [t.year for t in pd.DatetimeIndex(test_df.datetime)] test_df['year'] = test_df['year'].map({2011:0, 2012:1}) test_df.head()'",Yes,5,8.0 "df.drop('datetime',axis=1,inplace=True) df.head()",Yes,4,10.0 "df.drop(['casual','registered'],axis=1,inplace=True) df.head()",Yes,4,10.0 "sns.factorplot(x=""hour"",y=""count"",data=df,kind='bar',size=5,aspect=1.5)'",No,5,75.0 "sns.factorplot(x=""month"",y=""count"",data=df,kind='bar',size=5,aspect=1.5)'",No,5,33.0 "sns.factorplot(x=""year"",y=""count"",data=df,kind='bar',size=5,aspect=1.5)'",No,5,75.0 "sns.factorplot(x=""day"",y='count',kind='bar',data=df,size=5,aspect=1)'",No,5,33.0 "plt.scatter(x=""temp"",y=""count"",data=df,color='#ff4125')'",No,5,33.0 "new_df=df.copy() new_df.temp.describe() new_df['temp_bin']=np.floor(new_df['temp'])//5 new_df['temp_bin'].unique() # now we can visualize as follows sns.factorplot(x=""temp_bin"",y=""count"",data=new_df,kind='bar')'",No,4,33.0 df.columns.to_series().groupby(df.dtypes).groups,No,3,40.0 "x_train,x_test,y_train,y_test=train_test_split(df.drop('count',axis=1),df['count'],test_size=0.25,random_state=42)",No,4,13.0 "models=[RandomForestRegressor(),AdaBoostRegressor(),BaggingRegressor(),SVR(),KNeighborsRegressor()] model_names=['RandomForestRegressor','AdaBoostRegressor','BaggingRegressor','SVR','KNeighborsRegressor'] rmsle=[] d={} for model in range (len(models)): clf=models[model] clf.fit(x_train,y_train) test_pred=clf.predict(x_test) rmsle.append(np.sqrt(mean_squared_log_error(test_pred,y_test))) d={'Modelling Algo':model_names,'RMSLE':rmsle} d",Yes,3,7.0 "rmsle_frame=pd.DataFrame(d) rmsle_frame",No,4,12.0 "sns.factorplot(y='Modelling Algo',x='RMSLE',data=rmsle_frame,kind='bar',size=5,aspect=2)",No,4,33.0 "sns.factorplot(x='Modelling Algo',y='RMSLE',data=rmsle_frame,kind='point',size=5,aspect=2)",No,4,33.0 "no_of_test=[500] params_dict={'n_estimators':no_of_test,'n_jobs':[-1],'max_features':[""auto"",'sqrt','log2']} clf_rf=GridSearchCV(estimator=RandomForestRegressor(),param_grid=params_dict,scoring='neg_mean_squared_log_error') clf_rf.fit(x_train,y_train) pred=clf_rf.predict(x_test) print((np.sqrt(mean_squared_log_error(pred,y_test))))'",Yes,3,6.0 clf_rf.best_params_,No,5,2.0 "n_neighbors=[] for i in range (0,50,5): if(i!=0): n_neighbors.append(i) params_dict={'n_neighbors':n_neighbors,'n_jobs':[-1]} clf_knn=GridSearchCV(estimator=KNeighborsRegressor(),param_grid=params_dict,scoring='neg_mean_squared_log_error') clf_knn.fit(x_train,y_train) pred=clf_knn.predict(x_test) print((np.sqrt(mean_squared_log_error(pred,y_test))))",Yes,4,6.0 clf_knn.best_params_,No,5,2.0 "pred=clf_rf.predict(test_df.drop('datetime',axis=1)) d={'datetime':test['datetime'],'count':pred} ans=pd.DataFrame(d) ans.to_csv('answer.csv',index=False)",Yes,4,25.0 "# train . train = pd.read_csv(""../input/train.csv"") # train . 
print(train.shape) # train 5 . train.head()'",Yes,4,45.0 "# test test = pd.read_csv(""../input/test.csv"") # test . print(test.shape) # test 5 . test.head()'",Yes,4,45.0 "# . # train[""datetime""].dt.year # . # String . # Datetime . # datetime . train[""datetime""] = pd.to_datetime(train[""datetime""])'",No,5,16.0 "# train . print(train.shape) # "" "" . train[""datetime-year""] = train[""datetime""].dt.year train[""datetime-month""] = train[""datetime""].dt.month train[""datetime-day""] = train[""datetime""].dt.day train[""datetime-hour""] = train[""datetime""].dt.hour train[""datetime-minute""] = train[""datetime""].dt.minute train[""datetime-second""] = train[""datetime""].dt.second # 20180124 train[""datetime-dayofweek""] = train[""datetime""].dt.dayofweek # train . print(train.shape) # . train[[""datetime"", ""datetime-year"", ""datetime-month"", ""datetime-day"", ""datetime-hour"", ""datetime-minute"", ""datetime-second"", ""datetime-dayofweek""]].head()'",Yes,4,16.0 "# test datetime Type String datetime . test[""datetime""] = pd.to_datetime(test[""datetime""])'",No,5,16.0 "# train . print(test.shape) # datetime . test[""datetime-year""] = test[""datetime""].dt.year test[""datetime-month""] = test[""datetime""].dt.month test[""datetime-day""] = test[""datetime""].dt.day test[""datetime-hour""] = test[""datetime""].dt.hour test[""datetime-minute""] = test[""datetime""].dt.minute test[""datetime-second""] = test[""datetime""].dt.second # 20180124 test[""datetime-dayofweek""] = test[""datetime""].dt.dayofweek # train . print(test.shape) # . test[[""datetime"", ""datetime-year"", ""datetime-month"", ""datetime-day"", ""datetime-hour"", ""datetime-minute"", ""datetime-second"", ""datetime-dayofweek""]].head()'",Yes,4,8.0 "import seaborn as sns # . %matplotlib inline'",No,5,23.0 "sns.barplot(data=train, x=""weather"", y=""count"")",No,5,33.0 "sns.lmplot(data=train, x=""temp"", y=""atemp"")",No,5,33.0 "sns.distplot(train[""windspeed""])",No,5,33.0 "sns.barplot(data=train, x=""datetime-year"", y=""count"")",No,5,75.0 "sns.barplot(data=train, x=""datetime-month"", y=""count"")",No,5,75.0 "sns.barplot(data=train, x=""datetime-day"", y=""count"")",No,5,75.0 "sns.barplot(data=train, x=""datetime-hour"", y=""count"")",No,5,75.0 "sns.barplot(data=train, x=""datetime-minute"", y=""count"")",No,5,75.0 "sns.barplot(data=train, x=""datetime-second"", y=""count"")",No,5,33.0 "# Integer String . train[""datetime-year_month""] = train[""datetime-year""].astype(str) + ""-"" + train[""datetime-month""].astype(str) print(train.shape) train.info()'",Yes,3,40.0 "train[[""datetime-year"", ""datetime-month"", ""datetime-year_month""]]",No,5,41.0 "sns.barplot(data=train, x=""datetime-year_month"", y=""count"")",No,5,33.0 "# ! import matplotlib.pyplot as plt plt.figure(figsize=(24,4)) sns.barplot(data=train, x=""datetime-year_month"", y=""count"")'",Yes,5,33.0 "plt.figure(figsize=(24,4)) sns.pointplot(data=train, x=""datetime-hour"", y=""count"")",No,5,33.0 "plt.figure(figsize=(24,4)) sns.pointplot(data=train, x=""datetime-hour"", y=""count"", hue=""workingday"")",No,5,75.0 "plt.figure(figsize=(24,4)) sns.pointplot(data=train, x=""datetime-hour"", y=""count"", hue=""datetime-dayofweek"")",No,5,33.0 "# x_train . x_train = train[feature_names] # x_train . print(x_train.shape) # x_test 5 . x_train.head()'",Yes,4,41.0 "# x_test . x_test = test[feature_names] # x_test . print(x_test.shape) # x_test 5 . x_test.head()'",Yes,4,41.0 "# . 
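# Use the 'count' column of train as the target vector y_train and preview it.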
label_name = ""count"" y_train = train[label_name] print(y_train.shape) y_train.head()'",Yes,4,58.0 "from sklearn.ensemble import RandomForestRegressor model = RandomForestRegressor() model",Yes,5,4.0 "from sklearn.model_selection import cross_val_predict y_predict = cross_val_predict(model, x_train, y_train, cv=20) print(y_predict.shape) y_predict",Yes,4,27.0 "score = abs(y_train - y_predict).mean() f""score(Mean Absolute Error)={score:.6f}""",No,5,28.0 "# . model.fit(x_train, y_train)'",No,5,7.0 "# x_test . predictions = model.predict(x_test) # array . predictions'",No,5,48.0 "# predictions[:5]'",No,3,41.0 "submit = pd.read_csv(""../input/sampleSubmission.csv"") print(submit.shape) submit.head()",Yes,3,45.0 "submit[""count""] = predictions print(submit.shape) submit.head()",Yes,3,41.0 "submit.to_csv(""baseline-script.csv"", index=False) pd.read_csv(""baseline-script.csv"").head()",Yes,4,25.0 "df_train = pd.read_csv('../input/train.csv') df_train.head()",Yes,4,45.0 dtIdx = pd.DatetimeIndex(df_train['datetime']),No,5,16.0 "df_train['hour'] = dtIdx.hour df_train['dayofweek'] = dtIdx.dayofweek df_train['month'] = dtIdx.month df_origin = df_train",No,4,16.0 "df_train = df_train.drop(['casual', 'registered', 'datetime'], axis = 1) df_train.head()",Yes,5,10.0 "df_train_data = df_train.drop('count', axis=1)",No,5,10.0 df_train_target = df_train['count'],No,2,13.0 df_train_target.head(),No,5,41.0 "from sklearn import linear_model from sklearn import svm from sklearn.ensemble import RandomForestRegressor from sklearn import model_selection",No,5,22.0 "ms = model_selection.ShuffleSplit(n_splits=3, test_size=0.2, random_state=0)",No,5,13.0 df_train_data.head(),No,5,41.0 "X_train, X_test, y_train, y_test = model_selection.\\ train_test_split(df_train_data, df_train_target, test_size = 0.2, random_state=0)'",No,5,13.0 \,No,5,6.0 "import matplotlib.pyplot as plt from sklearn.model_selection import learning_curve from sklearn.model_selection import ShuffleSplit def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)): """""" Generate a simple plot of the test and training learning curve. Parameters ---------- estimator : object type that implements the ""fit"" and ""predict"" methods An object of that type which is cloned for each validation. title : string Title for the chart. X : array-like, shape (n_samples, n_features) Training vector, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape (n_samples) or (n_samples, n_features), optional Target relative to X for classification or regression; None for unsupervised learning. ylim : tuple, shape (ymin, ymax), optional Defines minimum and maximum yvalues plotted. cv : int, cross-validation generator or an iterable, optional Determines the cross-validation splitting strategy. Possible inputs for cv are: - None, to use the default 3-fold cross-validation, - integer, to specify the number of folds. - :term:`CV splitter`, - An iterable yielding (train, test) splits as arrays of indices. For integer/None inputs, if ``y`` is binary or multiclass, :class:`StratifiedKFold` used. If the estimator is not a classifier or if ``y`` is neither binary nor multiclass, :class:`KFold` is used. Refer :ref:`User Guide ` for the various cross-validators that can be used here. n_jobs : int or None, optional (default=None) Number of jobs to run in parallel. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. 
See :term:`Glossary ` for more details. train_sizes : array-like, shape (n_ticks,), dtype float or int Relative or absolute numbers of training examples that will be used to generate the learning curve. If the dtype is float, it is regarded as a fraction of the maximum size of the training set (that is determined by the selected validation method), i.e. it has to be within (0, 1]. Otherwise it is interpreted as absolute sizes of the training sets. Note that for classification the number of samples usually have to be big enough to contain at least one sample from each class. (default: np.linspace(0.1, 1.0, 5)) """""" plt.figure() plt.title(title) if ylim is not None: plt.ylim(*ylim) plt.xlabel(""Training examples"") plt.ylabel(""Score"") train_sizes, train_scores, test_scores = learning_curve( estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes) train_scores_mean = np.mean(train_scores, axis=1) train_scores_std = np.std(train_scores, axis=1) test_scores_mean = np.mean(test_scores, axis=1) test_scores_std = np.std(test_scores, axis=1) plt.grid() plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.1, color=""r"") plt.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, alpha=0.1, color=""g"") plt.plot(train_sizes, train_scores_mean, 'o-', color=""r"", label=""Training score"") plt.plot(train_sizes, test_scores_mean, 'o-', color=""g"", label=""Cross-validation score"") plt.legend(loc=""best"") return plt # cv = ShuffleSplit(n_splits=3, test_size=0.2, random_state=0) # title = ""Learning Curves (RFR, n_estimators=100)"" # estimator = RandomForestRegressor(n_estimators=100) # plot_learning_curve(estimator, title, # df_train_data, df_train_target, ylim=(0.7, 1.01), cv=cv, n_jobs=4) # plt.show()'",Yes,4,35.0 df_origin.columns,No,5,71.0 "df_origin.groupby('windspeed').mean().plot(y='count', marker='o')",No,3,33.0 "df_origin.groupby('humidity').mean().plot(y='count', marker='o')",No,4,33.0 d = df_origin.groupby('humidity'),No,5,60.0 "corr = df_origin[['temp','weather','windspeed','dayofweek', 'month', 'hour','count']].corr() corr",No,5,40.0 "import matplotlib.pyplot as plt plt.figure() plt.matshow(corr) plt.colorbar() plt.show()",Yes,5,80.0 "df_test = pd.read_csv('../input/test.csv') df_test.head()",Yes,4,45.0 "df_sample = pd.read_csv('../input/sampleSubmission.csv') df_sample.head()",Yes,4,45.0 "df_test['hour'] = pd.DatetimeIndex(df_test['datetime']).hour df_test['dayofweek'] = pd.DatetimeIndex(df_test['datetime']).dayofweek df_test['month'] = pd.DatetimeIndex(df_test['datetime']).month df_test_data = df_test.drop(['datetime'], axis=1) df_test_data.head()",Yes,4,8.0 "score = rfr.score(df_train_data, df_train_target)",No,5,28.0 "print(""score: %.3f""%score)",No,5,84.0 df_sample.head(),No,5,41.0 df_sample.info(),No,5,40.0 "df_sample.to_csv('submission.csv', index=False)",No,5,25.0 "df_demo = pd.read_csv('submission.csv') df_demo.head()",No,4,45.0 "import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv) import os print(os.listdir(""../input"")) import numpy as np import pandas as pd import matplotlib.pyplot as plt import seaborn as sns import datetime from sklearn import datasets, linear_model from sklearn.metrics import mean_squared_error %matplotlib inline data = pd.read_csv(""../input/train.csv"") test = pd.read_csv(""../input/test.csv"") test.dtypes ",Yes,3,45.0 "print(test.head(5)) ",No,5,41.0 "# Extract hours from datetime data['datetime'] = pd.to_datetime(data['datetime']) data['hour'] = data['datetime'].dt.hour data['month'] = data['datetime'].dt.month test['datetime'] = pd.to_datetime(test['datetime']) test['hour'] = data['datetime'].dt.hour test['month'] = data['datetime'].dt.month data['season'] = data.season.astype('category') data['month'] = data.month.astype('category') data['hour'] = data.hour.astype('category') data['holiday'] = data.holiday.astype('category') data['workingday'] = data.workingday.astype('category') data['weather'] = data.weather.astype('category') test['season'] = test.season.astype('category') test['month'] = test.month.astype('category') test['hour'] = test.hour.astype('category') test['holiday'] = test.holiday.astype('category') test['workingday'] = test.workingday.astype('category') test['weather'] = test.weather.astype('category') data.dtypes",No,3,16.0 "data = data.drop(['atemp', 'casual', 'registered', 'windspeed'], axis=1) test = test.drop(['atemp','windspeed'], axis=1)",No,5,10.0 test.head(2),No,5,41.0 "import math data['count'] = data['count'].transform(lambda x: math.log(x))",Yes,5,8.0 "data = data.drop(['datetime'], axis=1) data_dummy = data #test = test.drop(['datetime'], axis=1) test_dummy = test def dummify_dataset(df, column): df = pd.concat([df, pd.get_dummies(df[column], prefix=column, drop_first=True)],axis=1) df = df.drop([column], axis=1) return df columns_to_dummify = ['season', 'month', 'hour', 'holiday', 'workingday', 'weather'] for column in columns_to_dummify: data_dummy = dummify_dataset(data_dummy, column) test_dummy = dummify_dataset(test_dummy, column) test_dummy.head(5)",Yes,5,20.0 "from sklearn.model_selection import train_test_split y = data_dummy['count'] X = data_dummy.drop(['count'], axis=1) X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.33, random_state=42) ",Yes,4,21.0 "from sklearn.tree import DecisionTreeRegressor from sklearn.linear_model import LinearRegression, Ridge, HuberRegressor, ElasticNetCV from sklearn.metrics import mean_squared_log_error from sklearn.model_selection import cross_val_score from sklearn.model_selection import KFold from sklearn.ensemble import BaggingRegressor, ExtraTreesRegressor, GradientBoostingRegressor, RandomForestRegressor etr = ExtraTreesRegressor(max_depth= 20, n_estimators= 500) #etr.fit(X_train, y_train) #Y_to_train = train_sample[""count""] #X_to_train = train_sample.drop(['count'], axis=1) etr.fit(X_train,y_train) #y_pred = etr.predict(test_sample)'",Yes,5,7.0 "test_with_datetime = pd.read_csv(""../input/test.csv"") test_dummy = test_dummy.drop(['datetime'], axis=1) test_predictions = etr.predict(test_dummy)'",Yes,3,48.0 "predictions = np.exp(test_predictions ) submission = pd.DataFrame({ 'datetime': test.datetime.values, 'count': predictions }) submission.to_csv(""my_submission_10.csv"", index=False)'",Yes,5,25.0 "train = pd.read_csv('../input/train.csv') train.head()",Yes,4,45.0 "test = pd.read_csv('../input/test.csv') test.head()",Yes,4,45.0 "train = pd.read_csv(""../input/train.csv"", parse_dates = [""datetime""]) test = 
pd.read_csv(""../input/test.csv"", parse_dates = [""datetime""])",No,5,45.0 train.dtypes,No,5,70.0 "train[""year""] = train[""datetime""].dt.year train[""hour""] = train[""datetime""].dt.hour train[""dayofweek""] = train[""datetime""].dt.dayofweek test[""year""] = test[""datetime""].dt.year test[""hour""] = test[""datetime""].dt.hour test[""dayofweek""] = test[""datetime""].dt.dayofweek",No,5,8.0 "train.drop([""datetime"", ""windspeed"", ""casual"", ""registered"", ""count""], 1, inplace=True) test.drop([""datetime"", ""windspeed""], 1, inplace=True)",No,5,10.0 "from sklearn.ensemble import RandomForestRegressor rf = RandomForestRegressor(n_estimators=100) rf.fit(train,y_train) preds = rf.predict(test)",Yes,4,7.0 "submission = pd.read_csv('../input/sampleSubmission.csv') submission.head()",Yes,4,45.0 "submission[""count""] = np.expm1(preds) submission.head()",Yes,5,55.0 "submission.to_csv(""allrf.csv"", index=False)",No,5,25.0 "sample=pd.read_csv('../input/sampleSubmission.csv') train_df=pd.read_csv('../input/train.csv') test_df=pd.read_csv('../input/test.csv')",No,5,45.0 sample.head(),No,5,41.0 train_df.nunique(),No,5,54.0 test_df.nunique(),No,5,54.0 train_df.describe(),No,5,40.0 season_df=train_df.groupby('season'),No,5,60.0 season_df.head(),No,5,41.0 "train_df['hour']=hour train_df['day']=day train_df['month']=month train_df['year']=year",No,2,16.0 "hour=[] day=[] month=[] year=[] for row in test_df['datetime']: date_hour=row.split() date=date_hour[0] hour_row=date_hour[1] hour.append(hour_row.split(':')[0]) date=date.split('-') day.append(date[2]) month.append(date[1]) year.append(date[0]) test_df['hour']=hour test_df['day']=day test_df['month']=month test_df['year']=year",No,5,8.0 "datetime=['hour','day','month','year'] for time in datetime: train_df[time]=train_df[time].astype(int) test_df[time]=test_df[time].astype(int)",No,4,16.0 "#Continous Features Analysis for i in range(len(cont_feat)-1): for j in range(i+1,len(cont_feat)): sns.jointplot(cont_feat[i],cont_feat[j],data=train_df) plt.title('{} relation with {}'.format(cont_feat[i],cont_feat[j])) plt.show() ",No,5,33.0 "#Categorical feature analysis for cat in cat_feat: sns.barplot(x=cat,y='count',data=train_df,estimator=sum) plt.title('{} vs total_rent'.format(cat)) plt.show()",No,4,81.0 "climate=['temp','humidity','windspeed'] for clim in climate: sns.swarmplot(x='hour',y=clim,hue='season',data=train_df) plt.title('{} vs {}'.format('hour',clim)) plt.show()",No,5,33.0 "sns.distplot(train_df['count']) train_df['count']=train_df['count'].apply(lambda x:np.log(x))",No,4,33.0 sns.heatmap(train_df.corr()),No,5,80.0 train_df=pd.DataFrame(train_df),No,5,12.0 "train_df.set_index('datetime',inplace=True)",No,5,61.0 "test_df.set_index('datetime',inplace=True)",No,4,61.0 "train_df.drop(columns=['casual','registered'],axis=1,inplace=True)",No,4,61.0 "weather_df=pd.get_dummies(train_df['weather'],prefix='weather') yr_df=pd.get_dummies(train_df['year'],prefix='year') month_df=pd.get_dummies(train_df['month'],prefix='month') hour_df=pd.get_dummies(train_df['hour'],prefix='hour') season_df=pd.get_dummies(train_df['season'],prefix='season') train_df=train_df.join(weather_df) train_df=train_df.join(yr_df) train_df=train_df.join(month_df) train_df=train_df.join(hour_df) train_df=train_df.join(season_df) weather_df=pd.get_dummies(test_df['weather'],prefix='weather') yr_df=pd.get_dummies(test_df['year'],prefix='year') month_df=pd.get_dummies(test_df['month'],prefix='month') hour_df=pd.get_dummies(test_df['hour'],prefix='hour') 
season_df=pd.get_dummies(test_df['season'],prefix='season') test_df=test_df.join(weather_df) test_df=test_df.join(yr_df) test_df=test_df.join(month_df) test_df=test_df.join(hour_df) test_df=test_df.join(season_df)",No,4,20.0 "train_df.drop(columns=['season','hour','month','year','weather'],axis=1,inplace=True) test_df.drop(columns=['season','hour','month','year','weather'],axis=1,inplace=True)",No,5,10.0 "def rmlse(predicted,actual): sum_val=0 for i in range(len(predicted)): sum_val+=(np.log(predicted[i]+1)-np.log(actual[i]+1))**2 return (sum_val/len(predicted))**(0.5)",No,5,84.0 "X=train_df.drop(columns='count',axis=1) y=train_df['count']",No,5,21.0 X.info(),No,5,40.0 "from xgboost import XGBRegressor from sklearn.model_selection import train_test_split from sklearn.model_selection import GridSearchCV X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.33,random_state=42) ''' param_test1 = { 'max_depth':range(3,10,2), 'min_child_weight':range(1,6,2) } ''' ''' param_test2={ 'gamma':[0,0.125,0.25,0.5,0.75,1] } ''' ''' param_test3={ 'min_child_weight':[1,2,3,4,5,6,7,8,9] } ''' ''' param_test4={ 'learning_rate':[0.1,0.01,0.001] } ''' param_test5={ 'subsample':[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9] } gsearch1 = GridSearchCV(estimator = XGBRegressor( learning_rate =0.125, n_estimators=1000, max_depth=9, min_child_weight=4, gamma=0.125, subsample=0.8, colsample_bytree=0.8, random_state=42), param_grid = param_test5,n_jobs=4,iid=False, cv=5) gsearch1.fit(X_train,y_train) predicted=gsearch1.predict(X_test) print('Model Score: {}'.format(rmlse(np.exp(predicted),np.exp(y_test)))) print(gsearch1.best_params_)",No,4,6.0 y_test,Yes,5,41.0 "import xgboost as xgb xgr=xgb.XGBRegressor(learning_rate =0.1, n_estimators=1000, max_depth=9, min_child_weight=4, gamma=0.125, subsample=1, colsample_bytree=0.8) xgr.fit(train_df.drop(columns='count',axis=1),train_df['count']) y_predict=xgr.predict(test_df)",No,4,7.0 test_df['count']=np.exp(y_predict),Yes,5,8.0 result=pd.DataFrame(),No,5,12.0 "result['datetime']=test_df['datetime'] result['count']=test_df['count']",No,5,55.0 "result.to_csv('output.csv',index=False)",No,5,25.0 "sns.pointplot(x=df['temp'], y=df['count']) fig = plt.gcf() fig.set_size_inches(30,12);",No,5,33.0 "sns.pointplot(x=df['atemp'], y=df['count']) fig = plt.gcf() fig.set_size_inches(30,12);",No,5,33.0 "_, _, r_value, _, _ = stats.linregress(df['count'], df['atemp']) r_square = r_value ** 2 r_square.round(2)",No,5,47.0 "sns.pointplot(x=df['hour'], y=df['count']) fig = plt.gcf() fig.set_size_inches(30,12);",No,5,75.0 "sns.pointplot(x=df['temp'], y=df['atemp']) fig = plt.gcf() fig.set_size_inches(30,12);",No,5,33.0 "_, _, r_value, _, _ = stats.linregress(df['temp'], df['atemp']) r_square = r_value ** 2 r_square.round(2)",No,5,47.0 "df = df.drop(labels='atemp', axis=1)",No,5,10.0 "df = df.drop(labels='count_log', axis=1)",No,5,10.0 "df = df.drop(labels='count_boxcox', axis=1)",No,5,10.0 "df = pd.get_dummies(df, columns=['weather']) df.head()",No,5,20.0 "df = df.drop(labels='weather_4', axis=1) df.head()",No,5,10.0 "df['temp_weath_1'] = df['temp'] * df['weather_1'] df['temp_weath_2'] = df['temp'] * df['weather_2'] df['temp_weath_3'] = df['temp'] * df['weather_3']",No,5,8.0 "df['temp_weath_1'] = df['temp_weath_1'].astype(int) df['temp_weath_2'] = df['temp_weath_2'].astype(int) df['temp_weath_3'] = df['temp_weath_3'].astype(int)",No,5,16.0 "X = df.loc[:, df.columns != 'count'] y = np.log(df['count'])",No,5,21.0 "X.shape, y.shape",No,5,58.0 "X_train, X_test, y_train, y_test = 
train_test_split(X, y, random_state=42, test_size=0.2)",No,5,13.0 "X_train.shape, y_train.shape, X_test.shape, y_test.shape",No,5,58.0 "from sklearn.linear_model import LinearRegression, Lasso, ElasticNet, Ridge from sklearn.tree import DecisionTreeRegressor from sklearn.neighbors import KNeighborsRegressor from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor from sklearn.svm import SVR from sklearn.pipeline import Pipeline, make_pipeline from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler, Normalizer, minmax_scale, QuantileTransformer, RobustScaler, PolynomialFeatures from sklearn.model_selection import KFold, cross_val_score from xgboost import XGBRegressor",No,5,22.0 "pipelines = [] pipelines.append(('ScaledLR', Pipeline([('Scaler', StandardScaler()), ('LR', LinearRegression())]))) pipelines.append(('ScaledLASSO', Pipeline([('poly', PolynomialFeatures()), ('Scaler', StandardScaler()), ('LASSO', Lasso(random_state=42))]))) pipelines.append(('ScaledRID', Pipeline([('poly', PolynomialFeatures()), ('Scaler', StandardScaler()), ('RID', Ridge(random_state=42))]))) pipelines.append(('ScaledKNN', Pipeline([('poly', PolynomialFeatures()), ('Scaler', StandardScaler()), ('KNN', KNeighborsRegressor(n_neighbors=2))]))) pipelines.append(('ScaledCART', Pipeline([('poly', PolynomialFeatures()), ('Scaler', StandardScaler()), ('CART', DecisionTreeRegressor(random_state=42))]))) pipelines.append(('ScaledGBM', Pipeline([('poly', PolynomialFeatures()), ('Scaler', StandardScaler()), ('GBM', GradientBoostingRegressor(random_state=42))]))) pipelines.append(('ScaledRFR', Pipeline([('poly', PolynomialFeatures()), ('Scaler', StandardScaler()), ('RFR', RandomForestRegressor(random_state=42))]))) pipelines.append(('ScaledSVR', Pipeline([('poly', PolynomialFeatures()), ('Scaler', StandardScaler()), ('SVR', SVR(kernel='linear'))]))) pipelines.append(('ScaledXGBR', Pipeline([('poly', PolynomialFeatures()), ('Scaler', StandardScaler()), ('XGBR', XGBRegressor(random_state=42))]))) results = [] names = [] for name, model in pipelines: kfold = KFold(random_state=42) cv_results = -cross_val_score(model, X_train, y_train, cv=kfold, scoring='neg_mean_squared_log_error') results.append(np.sqrt(cv_results)) names.append(name) msg = ""{}: {} ({})"".format(name, cv_results.mean(), cv_results.std()) print(msg)'",No,5,79.0 "df_test = pd.read_csv(""../input/test.csv"")",No,5,45.0 df_test['datetime'] = pd.to_datetime(df_test['datetime']),No,5,16.0 df_test['dow'] = df_test['datetime'].dt.dayofweek,No,5,8.0 df_test['month'] = df_test['datetime'].dt.month,No,5,8.0 df_test['week'] = df_test['datetime'].dt.week,No,5,8.0 df_test['hour'] = df_test['datetime'].dt.hour,No,5,8.0 df_test['year'] = df_test['datetime'].dt.year,No,5,8.0 df_test['day'] = df_test['datetime'].dt.day,No,5,8.0 "df_test = df_test.drop(labels='datetime', axis=1)",No,5,10.0 "df_test = df_test.drop(labels='atemp', axis=1)",No,5,10.0 "df_test = pd.get_dummies(df_test, columns=['weather'])",No,5,20.0 "df_test = df_test.drop(labels='weather_4', axis=1)",No,5,10.0 "df_test['temp_weath_1'] = df_test['temp'] * df_test['weather_1'] df_test['temp_weath_2'] = df_test['temp'] * df_test['weather_2'] df_test['temp_weath_3'] = df_test['temp'] * df_test['weather_3']",No,5,8.0 "df_test['temp_weath_1'] = df_test['temp_weath_1'].astype(int) df_test['temp_weath_2'] = df_test['temp_weath_2'].astype(int) df_test['temp_weath_3'] = df_test['temp_weath_3'].astype(int)",No,5,16.0 "standardscaler = StandardScaler() model = 
XGBRegressor(colsample_bytree=0.7, learning_rate=0.05, max_depth=7, min_child_weight=4, subsample=0.7, random_state=42)",No,5,4.0 "model.fit(X_train, y_train)",No,5,7.0 model.predict(df_test),No,5,48.0 "pipe = Pipeline([('poly', PolynomialFeatures()), ('StandardScaler', standardscaler), ('XGBR', model)]) pipe.fit(X_train, y_train) y_pred = np.exp(pipe.predict(df_test)) y_pred",Yes,4,7.0 "df_test[['count']].to_csv('submission.csv', index=True)",No,5,25.0 df_test[['count']].head(),No,5,41.0 "# Carregar os dados df = pd.read_csv('../input/train.csv', parse_dates=[0]) test = pd.read_csv('../input/test.csv', parse_dates=[0])",No,5,45.0 "b""# Transformao da coluna datetime (feature engineering)\ndf['year'] = df['datetime'].dt.year\ndf['month'] = df['datetime'].dt.month\ndf['day'] = df['datetime'].dt.day\ndf['dayofweek'] = df['datetime'].dt.dayofweek\ndf['hour'] = df['datetime'].dt.hour""",No,5,8.0 "# Ordenar os dados pela coluna datetime df.sort_values('datetime', inplace=True)",No,5,9.0 "# Separando os dataframes test = df[df['count'].isnull()] df = df[~df['count'].isnull()]",No,5,13.0 "# Separando o df em treino e validao from sklearn.model_selection import train_test_split'",No,5,22.0 "train, valid = train_test_split(df, random_state=42)",No,5,13.0 "# Usar o modelo de RandomForest # Importar o modelo from sklearn.ensemble import RandomForestRegressor",No,5,22.0 "# Instanciar o modelo rf = RandomForestRegressor(random_state=42)",No,5,4.0 "# Treinar o modelo rf.fit(train[feats], train['count'])",No,5,7.0 "# Fazendo as previses preds = rf.predict(valid[feats])'",No,5,48.0 "# Analisar as previses com base na mtrica # Importando a mtrica from sklearn.metrics import mean_squared_error'",No,5,22.0 "b""# Validando as previses\nmean_squared_error(valid['count'], preds) ** (1/2)""",No,5,49.0 "# Melhorando o modelo de RandomForest rf = RandomForestRegressor(random_state=42, n_estimators=200, n_jobs=-1)",No,5,4.0 "# Preparando os dados para o kaggle # Criando as previses para os dados de teste preds_test = rf.predict(test[feats])'",No,5,48.0 "# Salvando o arquivo para o Kaggle test[['datetime', 'count']].to_csv('rf.csv', index=False)",No,5,25.0 "df = pd.read_csv('../input/train.csv', parse_dates=[0]) test = pd.read_csv('../input/test.csv', parse_dates=[0])",No,5,45.0 "df.rename(columns={'count':'rentals'},inplace=True)",No,5,61.0 "df = df.append(test, sort=False)",No,5,11.0 "df['year'] = df.datetime.dt.year # df['datetime'].dt.year df['month'] = df.datetime.dt.month df['day'] = df.datetime.dt.day df['dayofweek'] = df.datetime.dt.dayofweek df['hour'] = df['datetime'].dt.hour",No,5,8.0 df.sort_index(inplace=True),No,5,9.0 "# Separando os dataframes test = df[df['rentals'].isnull()] df = df[~df['rentals'].isnull()]",No,5,13.0 "from sklearn.model_selection import train_test_split train, valid = train_test_split(df, random_state=42)",No,5,13.0 "from sklearn.ensemble import RandomForestRegressor rf = RandomForestRegressor(random_state=42, n_estimators=100,n_jobs=-1)",Yes,4,4.0 "rf.fit(train[feats],train['rentals'])",No,5,7.0 "# Fazer as previses preds = rf.predict(valid[feats])'",No,4,48.0 "from sklearn.metrics import mean_squared_error mean_squared_error(valid['rentals'],preds)**(1/2)",Yes,4,49.0 "b""# Adicionar as previses ao dataframe\ntest['count'] = np.exp(preds_test)-1\n""",No,4,8.0 "test[['datetime','count']]",No,5,41.0 "train, valid = df[df['day'] <= 15], df.query('day > 15')",No,5,14.0 "# Preparando os dados para o kaggle # Criando as previses para os dados de teste preds_test = 
rf.predict(test[feats])'",No,5,48.0 "# Salvando o arquivo pro kaggle test[['datetime','count']].to_csv('rf2.csv', index=False) ",No,5,25.0 "from sklearn.ensemble import RandomForestRegressor rf = RandomForestRegressor(random_state=42, n_estimators=100,n_jobs=-1,oob_score=True)",No,5,4.0 "rf.fit(df[feats],df['rentals']) rf.oob_score_",No,4,7.0 "# Fazer as previses preds = rf.oob_prediction_'",Yes,4,22.0 "from sklearn.metrics import mean_squared_error mean_squared_error(df['rentals'],preds)**(1/2)",Yes,4,7.0 "# Salvando o arquivo pro kaggle test[['datetime','count']].to_csv('rf3.csv', index=False) ",No,5,25.0 " def cv(df, test, feats, y_name, k=5): score, preds, fis = [], [], [] chunk = df.shape[0] // k for i in range(k): if i+1 < k: valid = df.iloc[i*chunk: (i+1)*chunk] train = df.iloc[:i*chunk].append(df.iloc[(i+1)*chunk:]) else: valid: df.iloc[i*chunk:] train: df.iloc[:i*chunk] rf = RandomForestRegressor(random_state=42, n_jobs=-1, n_estimators=100) rf.fit(train[feats],train[y_name]) score.append(mean_squared_error(valid[y_name],rf.predict(valid[feats])) ** (1/2)) preds.append(rf.predict(test[feats])) fis.append(rf.feature_importances_) print(i, 'OK') return score, preds, fis",No,2,7.0 "score, preds, fis = cv(df, test, feats, 'rentals')",No,5,49.0 score,No,2,7.0 "pd.Series(score).mean() ",No,5,7.0 "test['count'] = np.exp(pd.DataFrame(preds).mean())-1 test[['datetime','count']].to_csv('rf4.csv', index=False) ",No,5,25.0 "import numpy as np import pandas as pd import pandas_profiling as pp import seaborn as sns import matplotlib.pyplot as plt import os import datetime",No,5,22.0 "import warnings warnings.filterwarnings('ignore')",No,5,23.0 "HOME_PATH = r'../input' os.listdir(HOME_PATH)",No,5,88.0 train_set = pd.read_csv(HOME_PATH+'/train.csv'),No,5,45.0 "def month_to_num(df): """""" Convert month to numerical """""" for i in range(1,10): df['month'].loc[df['month']=='0'+str(i)] = i for i in range(10,13): df['month'].loc[df['month']==str(i)] = i'",No,5,20.0 "def time_to_num(df): """""" Convert time to numerical """""" for i in range(0,10): df['time'].loc[df['time']=='0'+str(i)+':00:00'] = i for i in range(10,24): df['time'].loc[df['time']==str(i)+':00:00'] = i'",No,5,20.0 "def weekend(df): df['weekend'] = np.zeros_like(df['holiday']) df['weekend'].loc[(df['workingday'] == 0) & (df['holiday'] == 0)] = 1",No,5,8.0 "def weekday(df): df['weekday'] = df['datetime'].apply(lambda date : \\ datetime.datetime.strptime(str(date.split()[0]),""%Y-%m-%d"").weekday())'",No,5,8.0 "def process_df(df): split_datetime(df) round_temp(df) # month_to_num(df) time_to_num(df) weekend(df) weekday(df) return df.drop('datetime', axis=1)",Yes,4,8.0 "fig = plt.figure(figsize=(10, 4)) fig.add_subplot(1,2,1) sns.countplot(x='year', hue='count_bin', data=train_set.loc[train_set['year']==1]) fig.add_subplot(1,2,2) sns.countplot(x='year', hue='count_bin', data=train_set.loc[train_set['year']==2])",No,5,33.0 "fig = plt.figure(figsize=(20, 6)) fig.add_subplot(1,2,1) sns.countplot(x='weekday', hue='count_bin', data=train_set) fig.add_subplot(1,2,2) sns.countplot(x='time', hue='count_bin', data=train_set)",No,5,75.0 "sns.factorplot(x=""weekday"",y=""count"",data=train_set,kind='bar') sns.factorplot(x=""time"",y=""count"",data=train_set,kind='bar')'",No,5,75.0 "# fig = plt.figure(figsize=(10, 4)) # fig.add_subplot(1,2,1) sns.factorplot(x=""weekend"",y=""count"",data=train_set,kind='bar') # sns.countplot(x='weekend', hue='count_bin', data=train_set.loc[train_set['weekend']==1]) # fig.add_subplot(1,2,2) 
sns.factorplot(x=""workingday"",y=""count"",data=train_set,kind='bar') # sns.countplot(x='workingday', hue='count_bin', data=train_set.loc[train_set['workingday']==1])'",No,5,33.0 "sns.factorplot(x=""rounded_temp"",y=""count"",data=train_set,kind='bar')'",No,5,33.0 "sns.factorplot(x=""rounded_atemp"",y=""count"",data=train_set,kind='bar')'",No,5,33.0 "fig = plt.figure() fig.add_subplot(2,2,1) plt.hist(train_set['count'].loc[(train_set['count']<50) & (train_set['season']==1)]) fig.add_subplot(2,2,2) plt.hist(train_set['count'].loc[(train_set['count']<50) & (train_set['season']==2)]) fig.add_subplot(2,2,3) plt.hist(train_set['count'].loc[(train_set['count']<50) & (train_set['season']==3)]) fig.add_subplot(2,2,4) plt.hist(train_set['count'].loc[(train_set['count']<50) & (train_set['season']==4)])",No,5,33.0 "sns.factorplot(x=""season"",y=""count"",data=train_set,kind='bar')'",No,5,33.0 "sns.factorplot(x=""weather"",y=""count"",data=train_set,kind='bar')'",No,5,33.0 "sns.factorplot(x=""temp_bin"",y=""count"",data=train_set,kind='bar')'",No,5,33.0 "from sklearn.metrics import mean_squared_log_error from sklearn.ensemble import RandomForestRegressor from sklearn.model_selection import KFold, cross_val_score, GridSearchCV import lightgbm as lgb from xgboost import XGBRegressor from sklearn.metrics import accuracy_score, make_scorer",No,5,22.0 "train_set = pd.get_dummies(train_set, columns=['season', 'weather', 'weekday', 'holiday']) # train_set = pd.get_dummies(train_set, columns=['season', 'weather', 'weekday', 'year', 'month', 'time'])",No,5,20.0 "train_set['temp_weather_1'] = train_set['temp'] * train_set['weather_1'] train_set['temp_weather_2'] = train_set['temp'] * train_set['weather_2'] train_set['temp_weather_3'] = train_set['temp'] * train_set['weather_3'] train_set['temp_weather_4'] = train_set['temp'] * train_set['weather_4']",No,5,8.0 "y = train_set.loc[:, 'count'] X = train_set.drop(['count', 'count_bin', 'casual', 'registered'], axis=1) ",No,5,21.0 "rf = RandomForestRegressor(random_state=42, n_jobs=-1, n_estimators=200, oob_score=True, min_samples_split=4, max_features=0.9, max_depth=17) rf.fit(X, y)",No,5,7.0 "scores = cross_val_score(rf, X, y, cv=5, scoring='neg_mean_squared_log_error') print(""score, root score: "", scores, np.sqrt(np.abs(scores)))'",No,5,28.0 "d_importance = pd.DataFrame(columns=['features'], data=X.columns) d_importance['importance'] = rf.feature_importances_ d_importance.sort_values(by='importance',ascending=False).head(20)",No,5,79.0 "test_set = pd.read_csv(HOME_PATH+'/test.csv') y_test = test_set['datetime'] test_set = process_df(test_set) test_set = pd.get_dummies(test_set, columns=['season', 'weather', 'weekday', 'holiday'])",Yes,2,45.0 "test_set['temp_weather_1'] = test_set['temp'] * test_set['weather_1'] test_set['temp_weather_2'] = test_set['temp'] * test_set['weather_2'] test_set['temp_weather_3'] = test_set['temp'] * test_set['weather_3'] test_set['temp_weather_4'] = test_set['temp'] * test_set['weather_4'] test_set['temp_bin'] = np.floor(test_set['temp'])//5 # test_set['high_time'] = np.zeros_like(test_set['time']) # test_set['high_time'].loc[(((test_set['time'] > 6) & (test_set['time'] < 15)) | (test_set['time'] == 20))] = 1 # test_set['high_time'].loc[((test_set['time'] == 8) | (test_set['time'] == 16) | (test_set['time'] == 19))] = 2 # test_set['high_time'].loc[((test_set['time'] == 17) | (test_set['time'] == 18))] = 3",No,5,8.0 test_set.head(),No,5,41.0 "predictions = np.zeros_like(y_test) predictions = (rf.predict(test_set)).round().astype(int) 
predictions[predictions < 0] = 0 submission = pd.concat([y_test, pd.Series(predictions, name=""count"")], axis=1) print(submission.head(30)) submission.to_csv(""submission.csv"", index=False)",Yes,3,25.0 "import numpy as np import pandas as pd train = pd.read_csv(""../input/train.csv"", parse_dates = [""datetime""]) test = pd.read_csv(""../input/test.csv"", parse_dates = [""datetime""]) train[""year""] = train[""datetime""].dt.year train[""hour""] = train[""datetime""].dt.hour train[""dayofweek""] = train[""datetime""].dt.dayofweek test[""year""] = test[""datetime""].dt.year test[""hour""] = test[""datetime""].dt.hour test[""dayofweek""] = test[""datetime""].dt.dayofweek y_casual = np.log1p(train.casual) y_registered = np.log1p(train.registered) #y_train = np.log1p(train[""count""]) train.drop([""datetime"", ""windspeed"", ""casual"", ""registered"", ""count""], 1, inplace=True) test.drop([""datetime"", ""windspeed"", ], 1, inplace=True) import lightgbm as lgb hyperparameters = { 'colsample_bytree': 0.725, 'learning_rate': 0.013, 'num_leaves': 56, 'reg_alpha': 0.754, 'reg_lambda': 0.071, 'subsample': 0.523, 'n_estimators': 1093} model = lgb.LGBMRegressor(**hyperparameters) model.fit(train, y_casual) preds1 = model.predict(test) hyperparameters = { 'colsample_bytree': 0.639, 'learning_rate': 0.011, 'num_leaves': 30, 'reg_alpha': 0.351, 'reg_lambda': 0.587, 'subsample': 0.916, 'n_estimators': 2166} model = lgb.LGBMRegressor(**hyperparameters, ) model.fit(train, y_registered) preds2 = model.predict(test) submission=pd.read_csv(""../input/sampleSubmission.csv"") submission[""count""] = np.expm1(preds1) + np.expm1(preds2) #submission.to_csv(""allrf.csv"", index=False)'",Yes,1,22.0 "pd.options.display.max_rows = 200 submission[""holiday""] = test[""holiday""] submission.loc[(submission[""holiday""]==1)]",No,4,55.0 "# Filter cover type and then barplot of wilderness area to see if any trees grow exclusively in a region. #data.describe() data = dtrain.groupby(['Cover_Type'])[['Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4']].sum() # Transpose to get numbers by wilderness type. data.T.plot(kind = 'bar', figsize = (12,8)) plt.show()",Yes,3,8.0 "# Drop Soil type 15,7 - They have no variation. 
dtrain.drop(['Soil_Type7', 'Soil_Type15'], axis = 1, inplace = True) # filtering all columns that contain the str Soil soil_columns = dtrain.columns[dtrain.columns.str.contains('Soil')].to_list()",Yes,3,10.0 "data_soil = dtrain.groupby(['Cover_Type'])[soil_columns[:10]].sum() data_soil.T.plot(kind = 'bar', figsize = (18,8)) plt.show()",Yes,4,8.0 "data_soil = dtrain.groupby(['Cover_Type'])[soil_columns[10:20]].sum() data_soil.T.plot(kind = 'bar', figsize = (18,8)) plt.show()",Yes,4,8.0 "data_soil = dtrain.groupby(['Cover_Type'])[soil_columns[20:30]].sum() data_soil.T.plot(kind = 'bar', figsize = (18,8)) plt.show()",Yes,4,8.0 "data_soil = dtrain.groupby(['Cover_Type'])[soil_columns[30:]].sum() data_soil.T.plot(kind = 'bar', figsize = (18,8)) plt.show()",Yes,4,8.0 "label = dtrain['Cover_Type'] dtrain.drop(['Cover_Type'], axis = 1, inplace=True)",No,5,10.0 "from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV from sklearn.metrics import accuracy_score, f1_score, classification_report,confusion_matrix from sklearn.tree import DecisionTreeClassifier from sklearn.ensemble import RandomForestClassifier from xgboost import XGBClassifier",No,5,22.0 "x_train, x_test, y_train, y_test = train_test_split(dtrain, label, test_size = .3) dirty_clf = RandomForestClassifier() dirty_clf.fit(x_train, y_train) print(dirty_clf.score(x_test, y_test)) imp_feat = pd.DataFrame(index= dtrain.columns.to_list() , data= dirty_clf.feature_importances_) imp_feat.rename(columns={0 : 'Importance'}, inplace=True) imp_feat.sort_values(by='Importance', axis =0, ascending=False)[:15]",Yes,2,4.0 "baseline_features = ['Elevation', 'Horizontal_Distance_To_Roadways'] features = ['Elevation', 'Horizontal_Distance_To_Roadways', 'Horizontal_Distance_To_Hydrology', 'Horizontal_Distance_To_Fire_Points', 'Aspect','Wilderness_Area1', 'Wilderness_Area4', 'Soil_Type3', 'Soil_Type4','Soil_Type10', 'Soil_Type29', 'Soil_Type38'] x_train, x_test, y_train, y_test = train_test_split(dtrain[features], label, test_size = .3)",Yes,3,21.0 "clf = DecisionTreeClassifier(criterion='gini', max_depth=8, min_samples_split=2, class_weight= None, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_weight_fraction_leaf=0.0, presort='deprecated', random_state=None,)",No,5,4.0 "grid_params = {'criterion' : [""gini"", ""entropy""]} grid = GridSearchCV(estimator=clf, param_grid=grid_params, n_jobs=-1, cv = 5) grid.fit(x_train, y_train) grid.score(x_test, y_test)'",Yes,2,4.0 "grid.best_estimator_ y_pred = grid.predict(x_test)",No,5,48.0 "clf.fit(x_train, y_train) print(f'No of Leaves : {clf.get_n_leaves()}') clf.feature_importances_",Yes,2,7.0 "rnd_clf = RandomForestClassifier() grid_params_1 = {'max_depth' : [18], 'n_estimators' : [127], 'criterion':['entropy']} grid = GridSearchCV(estimator=rnd_clf, param_grid=grid_params_1, n_jobs=-1, cv = 5) grid.fit(x_train, y_train)",Yes,4,4.0 "final_clf = RandomForestClassifier(max_depth=18, n_estimators=127, criterion='entropy') final_clf.fit(x_train, y_train) print(final_clf.score(x_train, y_train)) print(final_clf.score(x_test, y_test)) y_hat = final_clf.predict(x_test)",Yes,2,4.0 "plt.figure(figsize=(8,8)) sns.heatmap(pd.DataFrame(confusion_matrix(y_test, y_pred), index = label_dict.values(), columns= label_dict.values()), annot=True, cbar = False) plt.show()",No,5,80.0 "imp_feat = pd.DataFrame(index= features , data= final_clf.feature_importances_) imp_feat.rename(columns={0 : 'Importance'}, inplace=True) 
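# Optional alternative (not the author's code): the same sorted importance
# table can be produced in one step with a Series, e.g.
#     pd.Series(final_clf.feature_importances_, index=features).sort_values(ascending=False)
# The DataFrame / rename / sort_values version used in this cell is kept as written.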
imp_feat.sort_values(by='Importance', axis =0, ascending=False)",Yes,2,12.0 "xgb_clf = XGBClassifier(n_estimators=100, max_depth = 12)",No,5,4.0 "xgb_clf.fit(x_train, y_train) xgb_clf.score(x_test, y_test)",Yes,3,7.0 y_pred = xgb_clf.predict(x_test),No,5,48.0 "# Final Fit xgb_clf.fit(dtrain[features], label)",No,5,7.0 y_test_hat = xgb_clf.predict(dtest[features]),No,5,48.0 sns.distplot(dtest.Elevation),No,5,33.0 "df_submit = pd.read_csv(submit_path, index_col=0) df_submit['Cover_Type'] =y_test_hat df_submit.to_csv('submit_kaggle.csv')",Yes,4,25.0 " import os #donnes print(os.listdir(""../input"")) # Any results you write to the current directory are saved as output. import pandas as pd import numpy as np #random forest pour regressor from sklearn.ensemble import RandomForestRegressor #Pour le split feature et target from sklearn.model_selection import train_test_split #Pour regression au lieu de accuracy c'est mean_square_error from sklearn.metrics import mean_squared_error #Ne pas afficher le warning lors du fit par exemple #Import pour la cross_validation from sklearn.model_selection import cross_val_score #import random forest pour regression from sklearn.ensemble import RandomForestRegressor import warnings warnings.filterwarnings('ignore')'",Yes,4,22.0 "# Fichier de train X_train = pd.read_csv(""../input/train.csv"") #Fichier de test X_test = pd.read_csv(""../input/test.csv"")",No,5,45.0 "submission = pd.read_csv(""../input/sampleSubmission.csv"") submission[""count""] = 195",Yes,3,25.0 "#Convertir notre fichier en csv submission.to_csv('submission.csv', index=False)",No,5,25.0 X_test.columns,No,5,71.0 "## Definition foncyion features et target def split_dataset(df, features, target='count'): X = df[features] y = df[target] return X, y ",No,5,21.0 "## Fonction split date def date_split(df_train, df_test, date='datetime'): ##Traitement_df_train cols=df_train[date] date_cols=pd.to_datetime(cols) df_train['year'] = date_cols.dt.year df_train['month'] = date_cols.dt.month df_train['day'] = date_cols.dt.day df_train['hour'] = date_cols.dt.hour df_train['minute'] = date_cols.dt.minute df_train['second'] = date_cols.dt.second df_train = df_train.drop(['datetime'], axis=1) ##Traitement_df_test cols2=df_test[date] date_cols2=pd.to_datetime(cols2) df_test['year'] = date_cols2.dt.year df_test['month'] = date_cols2.dt.month df_test['day'] = date_cols2.dt.day df_test['hour'] = date_cols2.dt.hour df_test['minute'] = date_cols2.dt.minute df_test['second'] = date_cols2.dt.second df_test = df_test.drop(['datetime'], axis=1) return df_train, df_test ",Yes,4,21.0 "#Definition de X_train et X_test avec les memes columns X_train, X_test = date_split(X_train, X_test)",No,5,13.0 "#Appel de la fonction pour avoir le meme nombre de columns: X_trainGet_cols = Get_cols(X_test) numbers = X_trainGet_cols.select_dtypes(np.number) numbers.head()",Yes,2,41.0 "##Definition features and target X_train_features, y_train_target = split_dataset(X_train, features=numbers.columns) X_train_features, y_train_target",No,5,13.0 "##################Cross Validation ## random forest regressor #Import Random Forest pour regressor from sklearn.ensemble import RandomForestRegressor rf = RandomForestRegressor() ##Cross Validation score = -cross_val_score(rf, X_train_features, y_train_target, cv=5, scoring='neg_mean_squared_error')",No,3,22.0 "############### FIT entrainer tout le set d'entrainement rf.fit(X_train_features, y_train_target)",No,5,7.0 "## Predict sur le train y_train_pred = rf.predict(X_train_features)",No,5,27.0 
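# Standalone sketch (not one of the original cells): the cross-validation above
# stores per-fold mean squared errors (the sign is already flipped back by the
# leading minus), so a square root turns them into the more readable RMSE.
# Dummy values stand in for the real `score` array computed earlier.
import numpy as np
_mse_per_fold_demo = np.array([9.0, 16.0, 25.0])   # pretend output of -cross_val_score(...)
_rmse_per_fold_demo = np.sqrt(_mse_per_fold_demo)  # -> array([3., 4., 5.])
print(_rmse_per_fold_demo.mean())                  # average RMSE across folds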
"###### Predict sur le test ## Voir si on a le meme nombre de columns dans test et dans train y_test_pred = rf.predict(X_test) ",No,5,48.0 "mean_train = mean_squared_error( y_train_target, y_train_pred) #mean_test = mean_squared_error(y_test, y_test_pred) mean_train #, mean_test",No,5,28.0 "submission = pd.read_csv(""../input/sampleSubmission.csv"") submission[""count""] = y_test_pred #Convertir notre fichier en csv submission.to_csv('submission.csv', index=False)'",Yes,3,45.0 submission.head(3),No,5,41.0 "import numpy as np import pandas as pd import seaborn as sns from scipy import stats import calendar from datetime import datetime import matplotlib.pyplot as plt %matplotlib inline ",Yes,4,22.0 "fig,axes = plt.subplots(2,2) fig.set_size_inches(12,10) sns.distplot(train['temp'],ax=axes[0,0]) sns.distplot(train['atemp'],ax=axes[0,1]) sns.distplot(train['humidity'],ax=axes[1,0]) sns.distplot(train['windspeed'],ax=axes[1,1]) axes[0,0].set(xlabel='temp',title='Distribtion of temp') axes[0,1].set(xlabel='atemp',title='Distribtion of atemp') axes[1,0].set(xlabel='humidity',title='Distribtion of humidity') axes[1,1].set(xlabel='windspeed',title='Distribtion of windspeed') ",Yes,3,33.0 "train['datetime'] = pd.to_datetime(train['datetime'],errors='coerce') train['date'] = train['datetime'].apply(lambda x: x.date()) train['year'] = train['datetime'].apply(lambda x: x.year) train['month'] = train['datetime'].apply(lambda x: x.month) train['weekday'] = train['datetime'].apply(lambda x: x.weekday()) train['hour'] = train['datetime'].apply(lambda x: x.hour).astype('int')",Yes,2,16.0 "dummies_month = pd.get_dummies(train['month'], prefix= 'month') dummies_season = pd.get_dummies(train['season'], prefix= 'season') dummies_weather = pd.get_dummies(train['weather'], prefix= 'weather') dummies_year = pd.get_dummies(train['year'], prefix= 'year') data=pd.concat([train,dummies_month,dummies_season,dummies_weather,dummies_year],axis=1) yLabels=data['count'] dropFeatures = ['casual' , 'count' , 'datetime' , 'registered' , 'date' ,'season', 'weather','month','year'] dataTrain = data.drop(dropFeatures,axis=1)",Yes,2,20.0 "from sklearn.ensemble import RandomForestRegressor from sklearn.model_selection import train_test_split from sklearn.metrics import mean_squared_log_error from math import sqrt import statsmodels.api as sm",No,5,22.0 "X_train_ca, X_valid_ca, y_train_ca, y_valid_ca = train_test_split(dataTrain, train['casual'], test_size=0.3, random_state=42) X_train_re, X_valid_re, y_train_re, y_valid_re = train_test_split(dataTrain, train['registered'], test_size=0.3, random_state=42) X_train, X_valid, y_train, y_valid = train_test_split(dataTrain, train['count'], test_size=0.3, random_state=42)",No,5,13.0 "rfModel_ca = RandomForestRegressor(n_estimators=1000 , oob_score=True, random_state = 42) rfModel_ca.fit(X_train_ca , y_train_ca) rfModel_re = RandomForestRegressor(n_estimators=1000 , oob_score=True, random_state = 42) rfModel_re.fit(X_train_re , y_train_re) rfModel = RandomForestRegressor(n_estimators=1000 , oob_score=True, random_state = 42) rfModel.fit(X_train , y_train)",Yes,2,4.0 "preds_train_ca = rfModel_ca.predict( X = X_train_ca) rmsle_casual_train = sqrt(mean_squared_log_error(y_train_ca, preds_train_ca)) preds_valid_ca = rfModel_ca.predict( X = X_valid_ca) rmsle_casual_valid = sqrt(mean_squared_log_error(y_valid_ca, preds_valid_ca)) print('Casual train rmsle : %.5f, valid rmsle : %.5f' %(rmsle_casual_train, rmsle_casual_valid)) preds_train_re = rfModel_re.predict( X = X_train_re) 
rmsle_registered_train = sqrt(mean_squared_log_error(y_train_re, preds_train_re)) preds_valid_re = rfModel_re.predict( X = X_valid_re) rmsle_registered_valid = sqrt(mean_squared_log_error(y_valid_re, preds_valid_re)) print('Registered train rmsle : %.5f,valid rmsle : %.5f' %(rmsle_registered_train, rmsle_registered_valid))",Yes,3,27.0 "preds_train = rfModel.predict( X = X_train) rmsle_count_train = sqrt(mean_squared_log_error(y_train, preds_train)) preds_valid = rfModel.predict( X = X_valid) rmsle_count_valid = sqrt(mean_squared_log_error(y_valid, preds_valid)) print('Count train rmsle : %.5f, valid rmsle : %.5f' %(rmsle_count_train, rmsle_count_valid))",Yes,3,27.0 "preds_train_merge_count = preds_train_re + preds_train_ca preds_valid_merge_count = preds_valid_re + preds_valid_ca rmsle_merge_train = sqrt(mean_squared_log_error(y_train,preds_train_merge_count)) rmsle_merge_valid = sqrt(mean_squared_log_error(y_valid,preds_valid_merge_count)) print('(merge) Count train rmsle : %.5f, valid rmsle : %.5f'%(rmsle_merge_train, rmsle_merge_valid))",Yes,2,11.0 "test['datetime'] = pd.to_datetime(test['datetime'],errors='coerce') test['date'] = test['datetime'].apply(lambda x: x.date()) test['year'] = test['datetime'].apply(lambda x: x.year) test['month'] = test['datetime'].apply(lambda x: x.month) test['weekday'] = test['datetime'].apply(lambda x: x.weekday()) test['hour'] = test['datetime'].apply(lambda x: x.hour).astype('int')",Yes,2,16.0 "dummies_month = pd.get_dummies(test['month'], prefix= 'month') dummies_season = pd.get_dummies(test['season'], prefix= 'season') dummies_weather = pd.get_dummies(test['weather'], prefix= 'weather') dummies_year = pd.get_dummies(test['year'], prefix= 'year') data_test=pd.concat([test,dummies_month,dummies_season,dummies_weather,dummies_year],axis=1) datetimecol = test['datetime'] yLabels=data['count'] dropFeatures = ['datetime' , 'date' ,'season', 'weather','month','year'] dataTest = data_test.drop(dropFeatures,axis=1)",Yes,2,20.0 "predsTest= rfModel_re.predict(X = dataTest) + rfModel_ca.predict(X = dataTest) submission=pd.DataFrame({'datetime':datetimecol , 'count':[max(0,x) for x in predsTest]})",Yes,2,48.0 "submission.to_csv('sampleSubmission.csv',index=False)",No,5,25.0 "import numpy as np import pandas as pd import matplotlib.pyplot as plt import seaborn as sns %matplotlib inline import warnings warnings.filterwarnings('ignore', category=RuntimeWarning) import os print(os.listdir(""../input""))'",Yes,4,22.0 "bike_df = pd.read_csv('../input/train.csv') bike_df.shape",Yes,2,45.0 bike_df.info(),No,5,40.0 bike_df.head(),No,5,41.0 "bike_df.drop(['datetime', 'casual', 'registered'], axis=1, inplace=True)",No,5,10.0 y_target.hist(),No,5,33.0 "y_log_transform = np.log1p(y_target) y_log_transform.hist()",Yes,3,33.0 "coef = pd.Series(lr_reg.coef_, index=X_features.columns) coef_sort = coef.sort_values(ascending=False) sns.barplot(x=coef_sort.values, y=coef_sort.index)",Yes,3,9.0 "X_features_ohe = pd.get_dummies(X_features, columns=['year', 'month', 'hour', 'holiday', 'workingday', 'season', 'weather'])",No,5,20.0 X_features_ohe.head(),No,5,41.0 "coef = pd.Series(lr_reg.coef_, index=X_features_ohe.columns) coef_sort = coef.sort_values(ascending=False)[:15] sns.barplot(x=coef_sort.values, y=coef_sort.index)",No,5,79.0 "X_train, X_test, y_train, y_test = train_test_split(X_features_ohe, y_target_log, test_size=0.3, random_state=2019) from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor from xgboost import XGBRegressor from lightgbm import 
LGBMRegressor rf_reg = RandomForestRegressor(n_estimators=500) gbm_reg = GradientBoostingRegressor(n_estimators=500) xgb_reg = XGBRegressor(n_estimators=500) lgbm_reg = LGBMRegressor(n_estimators=500) for model in [rf_reg, gbm_reg, xgb_reg, lgbm_reg]: get_model_predict(model, X_train, X_test, y_train, y_test, is_expm1=True)",No,3,4.0 submission = pd.read_csv('../input/sampleSubmission.csv'),No,5,45.0 "X_test = pd.read_csv('../input/test.csv') X_test.head()",Yes,3,45.0 "X_test.drop(['datetime'], axis=1, inplace=True) X_test.head()",Yes,2,10.0 "X_test_ohe = pd.get_dummies(X_test, columns=['year', 'month', 'hour', 'holiday', 'workingday', 'season', 'weather']) X_test_ohe.head()",Yes,2,20.0 prediction = lgbm_reg.predict(X_test_ohe),No,5,48.0 "submission.to_csv('./My_submission.csv', index=False)",No,5,25.0 "import numpy as np import pandas as pd import matplotlib.pyplot as plt import seaborn as sns import os import datetime import calendar",Yes,4,22.0 "from sklearn.model_selection import train_test_split from sklearn.preprocessing import MinMaxScaler from sklearn.metrics import mean_squared_error ",No,5,22.0 "from xgboost import XGBRegressor from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor",No,5,22.0 "import warnings warnings.filterwarnings(""ignore"")",Yes,4,22.0 "train_data = pd.read_csv(path_in + 'train.csv', parse_dates = ['datetime'], index_col='datetime', infer_datetime_format=True) test_data = pd.read_csv(path_in + 'test.csv', parse_dates = ['datetime'], index_col='datetime', infer_datetime_format=True) samp_subm = pd.read_csv(path_in+'sampleSubmission.csv', parse_dates = ['datetime'], index_col='datetime', infer_datetime_format=True)",No,5,45.0 "# Parameters num_months_per_year = 12 year_list = [2011, 2012]",No,5,77.0 "month = 5 year = 2011 start_date = datetime.datetime(year, month, 1, 0, 0, 0) end_date = datetime.datetime(year, month, 19, 23, 0, 0) # train_data['count_log'] = np.log1p(train_data['count']) # train_data['rolling_mean'] = train_data['count'].rolling(window = 24).mean() # train_data['rolling_std'] = train_data['count'].rolling(window = 24).std() ",Yes,2,77.0 "train_data = pd.get_dummies(train_data) test_data = pd.get_dummies(test_data)",No,5,20.0 "scaler = MinMaxScaler() train_data[scale_features] = scaler.fit_transform(train_data[scale_features]) test_data[scale_features] = scaler.transform(test_data[scale_features])",Yes,2,4.0 "import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) # Input data files are available in the ""../input/"" directory. 
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory import os print(os.listdir(""../input"")) # Any results you write to the current directory are saved as output.",Yes,5,88.0 "# Additional Libraries import matplotlib.pyplot as plt import seaborn as sns from sklearn.preprocessing import MinMaxScaler from sklearn.model_selection import train_test_split from xgboost.sklearn import XGBRegressor from sklearn.ensemble import RandomForestRegressor from sklearn.metrics import accuracy_score from keras.callbacks import ModelCheckpoint from keras.models import Sequential from keras.layers import Dense, Activation, Flatten from keras.layers import Dropout import lightgbm as lgb from sklearn.metrics import mean_squared_error from math import sqrt",No,5,22.0 "b""def define_data(data , info=True ,shape = True, percentage =True,describe = True , sample=True , columns = False):\n if columns == True:\n print('\\nColumns of Data...')\n print(data.columns)\n return \n if shape ==True:\n print('Shape of Data is...')\n print(data.shape)\n if info==True:\n print('\\nInfo of Data...')\n print(data.info())\n if percentage ==True:\n print('\\nPercentage of Data Missing ...')\n print((data.isnull().sum()/data.shape[0])*100)\n if describe == True:\n print('\\nDescription of data...')\n display(data.describe())\n if sample == True:\n print('\\nSample of Data...')\n display(data.sample(10).T)\n \n\ndefine_data(train)""",No,4,40.0 "define_data(train , columns = True) define_data(test , columns = True)",No,4,40.0 "# Divide DateTime Column to various Columns def add_dates(data , column , suffix='time_' , year = True , month = True , day = False ,dayofweek = True, hour = True , minute = False , second = False , date = False , time = False): data['add_date_date_time'] = pd.to_datetime(data[column]) if year == True: data[suffix+'year']=data['add_date_date_time'].dt.year if month == True: data[suffix+'month']=data['add_date_date_time'].dt.month if day == True: data[suffix+'day']=data['add_date_date_time'].dt.day if hour == True: data[suffix+'hour']=data['add_date_date_time'].dt.hour if minute == True: data[suffix+'minute']=data['add_date_date_time'].dt.minute if date == True: data[suffix+'date']=data['add_date_date_time'].dt.date if time == True: data[suffix+'time']=data['add_date_date_time'].dt.time if second == True: data[suffix+'second']=data['add_date_date_time'].dt.second if dayofweek == True: data[suffix+'dayofweek']=data['add_date_date_time'].dt.dayofweek data = data.drop(columns = ['add_date_date_time'] , axis =1) return data train = add_dates(train , column = 'datetime') define_data(train , columns = True)",No,5,16.0 "b""def unique_count(data , columns = []):\n for col in columns :\n print('Unique Data Percentage in ',col)\n print((data[col].value_counts()/data.shape[0])*100)\n print('\\n')\nunique_count(train , columns = ['season','weather','time_year', 'time_dayofweek'])""",No,4,54.0 "b""def display_unique_data(data):\n for i in data.columns:\n unique_cols_data = data[i].unique()\n if len(unique_cols_data)<20:\n print('Correct Type on Column -> ',i)\n print('Unique data in this Column is -> ',unique_cols_data)\n print('\\n')\ndisplay_unique_data(train)""",No,5,57.0 "display(train.corr().style.format(""{:.2%}"").highlight_min()) # f,ax = plt.subplots(figsize=(15, 15)) # sns.heatmap(train.corr(), annot=True, linewidths=.5, fmt= '.1f',ax=ax)'",No,4,80.0 "def joint_plots(data , col,columns = []): plt.figure(figsize=(16,16)) for i in columns: sns.jointplot( 
x=col , y=i , data=data , height=10, ratio=3 , color='g') plt.show() joint_plots(train , columns = ['temp' , 'atemp' ,'humidity' , 'windspeed' ] , col = 'count')",No,5,33.0 "def plot_bar(data, col , feature=[]): length = len(feature)*4 plt.figure(figsize=(20,length)) for i,j in zip(feature,range(1,len(feature)*2-1,2)): plt.subplot(10,2,j) #fig = plt.figure(figsize=(9,8)) sns.barplot(x=i, y=col, data=data, palette='Set2',orient='v') plt.plot() plt.subplot(10,2,j+1) sns.boxplot(x=i, y=col, data=data, palette='Set2' , width=.4) plt.plot()",No,5,33.0 "plot_bar(train, col = 'count',feature =['time_hour','time_month','time_dayofweek','time_year','weather', 'holiday' , 'workingday' , 'season' ])",No,5,33.0 "b""def new_col_categorical(data , columns = [] , remove_original = True):\n for i in columns:\n unique_cols = data[i].unique()\n if len(unique_cols) < 20:\n print('\\nCorrect Type on Column -> ',i)\n print('Unique data in this Column is -> ',unique_cols)\n else:\n return data\n if remove_original == False:\n original_data = data[columns]\n data = pd.get_dummies(data , columns = columns)\n if remove_original == False:\n data = pd.concat([data,original_data] , axis=1)\n return data\n """,No,3,20.0 "train = new_col_categorical(train,columns=['season','weather','time_year', 'time_dayofweek' , 'time_month','time_hour_group'] , remove_original = False)",No,5,8.0 "define_data(train, columns = True ) # train_x_new = train.drop(columns =['datetime','count', 'season_1','casual','registered', # 'season_2', 'season_3', 'season_4', 'weather_1', 'weather_2', # 'weather_3', 'weather_4', 'time_year_2011', 'time_year_2012', # 'time_dayofweek_0', 'time_dayofweek_1', 'time_dayofweek_2', # 'time_dayofweek_3', 'time_dayofweek_4', 'time_dayofweek_5', # 'time_dayofweek_6', 'time_month_1', 'time_month_2', 'time_month_3', # 'time_month_4', 'time_month_5', 'time_month_6', 'time_month_7', # 'time_month_8', 'time_month_9', 'time_month_10', 'time_month_11', # 'time_month_12'] , axis = 1) train_x_new = train.drop(columns =['datetime','count', 'casual','registered', 'season', 'weather', 'time_year', 'time_dayofweek', 'time_month','time_hour_group'] , axis = 1) train_y_new = train['count'] define_data(train_x_new, columns = True )",Yes,5,21.0 "# Processing Test Data test = add_dates(test , column = 'datetime') test['time_hour_group'] = test['time_hour'].apply(hour_group).astype(str) test = new_col_categorical(test,columns=['season','weather','time_year', 'time_dayofweek' , 'time_month','time_hour_group'] , remove_original = False) test['weekend'] = test['time_dayofweek_5']+test['time_dayofweek_6'] # test_x_new = test.drop(columns =['datetime', 'season_1', # 'season_2', 'season_3', 'season_4', 'weather_1', 'weather_2', # 'weather_3', 'weather_4', 'time_year_2011', 'time_year_2012', # 'time_dayofweek_0', 'time_dayofweek_1', 'time_dayofweek_2', # 'time_dayofweek_3', 'time_dayofweek_4', 'time_dayofweek_5', # 'time_dayofweek_6', 'time_month_1', 'time_month_2', 'time_month_3', # 'time_month_4', 'time_month_5', 'time_month_6', 'time_month_7', # 'time_month_8', 'time_month_9', 'time_month_10', 'time_month_11', # 'time_month_12'] , axis = 1) test_x_new = test.drop(columns =['datetime', 'season', 'weather', 'time_year', 'time_dayofweek', 'time_month','time_hour_group'] , axis = 1)",No,3,8.0 "print('For Train Data .. ') define_data(train_x_new, columns = True ) print('For Test Data .. 
') define_data(test_x_new , columns = True )",No,5,40.0 "scaler = MinMaxScaler() train_x_new = scaler.fit_transform(train_x_new) train_y_new = np.log1p(train_y_new)",Yes,5,18.0 "test_x_new_1 = scaler.transform(test_x_new) test_x_new_2 = scaler.fit_transform(test_x_new)",No,5,18.0 "X_train , X_test , Y_train , Y_test = train_test_split(train_x_new , train_y_new , test_size = .15 , random_state = 65 )",No,5,13.0 "valid_0_error =0 valid_1_error =0",No,5,77.0 "%%time params = { ""objective"" : ""regression"", ""metric"" : ""mae"", ""num_leaves"" : 60, ""learning_rate"" : 0.01, ""bagging_fraction"" : 0.9, ""bagging_seed"" : 0, ""num_threads"" : 4, ""colsample_bytree"" : 0.5, 'lambda_l2':9 } model = lgb.train( params, train_set = train_set, num_boost_round=10000, early_stopping_rounds=200, verbose_eval=100, valid_sets=[train_set,val_set] ) '",No,4,7.0 "%%time lgb_pred_test = model.predict(X_test, num_iteration=model.best_iteration) lgb_pred_train = model.predict(X_train, num_iteration=model.best_iteration) lgb_pred_normal = model.predict(test_x_new_1, num_iteration=model.best_iteration) lgb_pred_fit = model.predict(test_x_new_2, num_iteration=model.best_iteration)",Yes,5,48.0 "# print(lgb_pred) # print(np.array(Y_test)) valid_0_error_new = sqrt(mean_squared_error(np.array(Y_train),lgb_pred_train)) valid_1_error_new = sqrt(mean_squared_error(np.array(Y_test),lgb_pred_test)) score_diff(valid_0_error , valid_1_error , valid_0_error_new , valid_1_error_new) valid_0_error = valid_0_error_new valid_1_error = valid_1_error_new",No,3,49.0 lgb.plot_importance(model),No,5,79.0 "%%time n_estimators=100 xgb = XGBRegressor(n_estimators=n_estimators,max_depth=4,learning_rate =0.01 , booster = 'gbtree') xgb.fit(X_train ,Y_train ,eval_set=[(X_train, Y_train), (X_test, Y_test)] , verbose = False) score = xgb.evals_result() valid_0_error_new = np.amin(score['validation_0']['rmse']) valid_1_error_new = np.amin(score['validation_1']['rmse']) score_diff(valid_0_error , valid_1_error , valid_0_error_new , valid_1_error_new) valid_0_error = valid_0_error_new valid_1_error = valid_1_error_new",Yes,3,7.0 "%%time model = RandomForestRegressor(random_state=65, n_estimators=200, min_samples_split=4) result = model.fit(X_train, Y_train)",Yes,5,7.0 "model.score(X_test, Y_test)",No,5,49.0 "%%time n_estimators=3000 xgb = XGBRegressor(n_estimators=n_estimators,max_depth=4,learning_rate =0.01 , booster = 'gbtree') xgb.fit(train_x_new , train_y_new ,eval_set=[(X_train, Y_train), (X_test, Y_test)] , verbose = False) pred_normal = xgb.predict(test_x_new_1) pred_fit = xgb.predict(test_x_new_2)",Yes,4,7.0 "%%time model = RandomForestRegressor(random_state=65, n_estimators=n_estimators-2000) model.fit(train_x_new , train_y_new) rfr_pred_normal = model.predict(test_x_new_1) rfr_pred_fit = model.predict(test_x_new_2)",Yes,3,7.0 "%%time NN_model = Sequential() # The Input Layer : NN_model.add(Dense(128, kernel_initializer='normal',input_dim = train_x_new.shape[1], activation='relu')) # The Hidden Layers : NN_model.add(Dense(256, kernel_initializer='normal',activation='relu')) NN_model.add(Dropout(0.3)) NN_model.add(Dense(256, kernel_initializer='normal',activation='relu')) NN_model.add(Dropout(0.3)) NN_model.add(Dense(256, kernel_initializer='normal',activation='relu')) NN_model.add(Dropout(0.3)) # The Output Layer : NN_model.add(Dense(1, kernel_initializer='normal',activation='linear')) # Compile the network : NN_model.compile(loss='mean_absolute_error', optimizer='adam', metrics=['mean_absolute_error']) NN_model.summary() history = 
NN_model.fit(train_x_new,train_y_new, epochs=50, batch_size=64, verbose=1, validation_split=0.2) start_point = 150-100 r = range(start_point,150) plt.figure(figsize=(16,8)) plt.plot( history.history['loss'] ,'r' ,label ='Train') plt.plot( history.history['val_loss'] , 'g' , label = 'Test' ) plt.legend(fontsize='x-large') valid_0_error_new = history.history['loss'][-1] valid_1_error_new = history.history['val_loss'][-1] score_diff(valid_0_error , valid_1_error , valid_0_error_new , valid_1_error_new) valid_0_error = valid_0_error_new valid_1_error = valid_1_error_new ",Yes,5,84.0 "ANN_pred_normal = NN_model.predict(test_x_new_1) ANN_pred_fit = NN_model.predict(test_x_new_2) ANN_pred_normal = np.expm1(ANN_pred_normal) ANN_pred_fit = np.expm1(ANN_pred_fit) ANN_pred_fit = ANN_pred_fit.reshape(6493) ANN_pred_normal = ANN_pred_normal.reshape(6493)",No,4,48.0 "output = pd.DataFrame({'datetime': test.datetime,'count': pred_normal}) output.to_csv('xgb_pred_normal.csv', index=False) output = pd.DataFrame({'datetime': test.datetime,'count': pred_fit}) output.to_csv('xgb_pred_fit.csv', index=False) output = pd.DataFrame({'datetime': test.datetime,'count': rfr_pred_normal}) output.to_csv('rfr_pred_normal.csv', index=False) output = pd.DataFrame({'datetime': test.datetime,'count': rfr_pred_fit}) output.to_csv('rfr_pred_fit.csv', index=False) output = pd.DataFrame({'datetime': test.datetime,'count': ANN_pred_normal}) output.to_csv('ANN_pred_normal.csv', index=False) output = pd.DataFrame({'datetime': test.datetime,'count': ANN_pred_fit}) output.to_csv('ANN_pred_fit.csv', index=False) output = pd.DataFrame({'datetime': test.datetime,'count': lgb_pred_normal}) output.to_csv('lgb_pred_normal.csv', index=False) output = pd.DataFrame({'datetime': test.datetime,'count': lgb_pred_fit}) output.to_csv('lgb_pred_fit.csv', index=False)",No,5,25.0 "train = pd.read_csv(""../input/train.csv"", parse_dates = [""datetime""]) test = pd.read_csv(""../input/test.csv"", parse_dates = [""datetime""]) train[""year""] = train[""datetime""].dt.year train[""hour""] = train[""datetime""].dt.hour train[""dayofweek""] = train[""datetime""].dt.dayofweek test[""year""] = test[""datetime""].dt.year test[""hour""] = test[""datetime""].dt.hour test[""dayofweek""] = test[""datetime""].dt.dayofweek",No,3,45.0 "train.info() # train.shape, train.isnull().sum(), train.dtypes '",No,5,40.0 "train.describe() # , , '",No,5,40.0 "b""train['temp'].value_counts().sort_index() # (binning)""",No,5,72.0 "import seaborn as sns import matplotlib.pylab as plt _, axes = plt.subplots(1,1, figsize = (20,12)) sns.boxplot(x=train[""hour""], y=train[""count""])",Yes,5,75.0 "fig, axes = plt.subplots(3,1, figsize = (20,12)) sns.countplot(train[""season""], ax = axes[0], palette=""Set1"") sns.countplot(train[""weather""], ax = axes[1], palette=""Set1"") sns.countplot(train[""windspeed""], ax = axes[2])",No,5,33.0 "fig, axes = plt.subplots(3,1, figsize = (20,12)) sns.countplot(train[""season""], ax = axes[0], palette=""Set1"") sns.countplot(train[""weather""], ax = axes[1], palette=""Set1"") sns.countplot(train[""windspeed""], ax = axes[2]) plt.xticks(rotation = 60, )",No,5,33.0 "y_casual = np.log1p(train.casual) y_registered = np.log1p(train.registered) #y_train = np.log1p(train[""count""]) train.drop([""datetime"", ""windspeed"", ""casual"", ""registered"", ""count""], 1, inplace=True) test.drop([""datetime"", ""windspeed"", ], 1, inplace=True)",Yes,5,21.0 "import lightgbm as lgb hyperparameters = { 'colsample_bytree': 0.725, 'learning_rate': 0.013, 
'num_leaves': 56, 'reg_alpha': 0.754, 'reg_lambda': 0.071, 'subsample': 0.523, 'n_estimators': 1093} model = lgb.LGBMRegressor(**hyperparameters) model.fit(train, y_casual) preds1 = model.predict(test) hyperparameters = { 'colsample_bytree': 0.639, 'learning_rate': 0.011, 'num_leaves': 30, 'reg_alpha': 0.351, 'reg_lambda': 0.587, 'subsample': 0.916, 'n_estimators': 2166} model = lgb.LGBMRegressor(**hyperparameters, ) model.fit(train, y_registered) preds2 = model.predict(test) submission=pd.read_csv(""../input/sampleSubmission.csv"") submission[""count""] = np.expm1(preds1) + np.expm1(preds2) #submission.to_csv(""allrf.csv"", index=False)'",Yes,3,7.0 submission.iloc[6332:6354],No,5,14.0 "submission.to_csv(""lgb.csv"", index=False)",No,5,25.0 "import pandas as pd import numpy as np import matplotlib as mpl import matplotlib.pyplot as plt import seaborn as sns import os from scipy import stats import missingno as msno plt.style.use('seaborn') import warnings warnings.filterwarnings(""ignore"") mpl.rcParams['axes.unicode_minus'] = False %matplotlib inline # import . # mlp.rcParams['axes.unicode_minus'] = False .",Yes,5,23.0 os.listdir('../input/'),No,4,88.0 "df_train = pd.read_csv(""../input/bike-sharing-demand/train.csv"", parse_dates = [""datetime""]) df_test = pd.read_csv(""../input/bike-sharing-demand/test.csv"", parse_dates = [""datetime""])",No,5,45.0 "b""for col in df_train.columns:\n msperc = 'column: {:>10}\\t Percent of NaN value: {:.2f}%'.format(col, 100 * (df_train[col].isnull().sum() / df_train[col].shape[0]))\n print(msperc)""",No,5,39.0 "b""for col in df_test.columns:\n msperc = 'column: {:>10}\\t Percent of NaN value: {:.2f}%'.format(col, 100 * (df_test[col].isnull().sum() / df_test[col].shape[0]))\n print(msperc)""",No,5,39.0 "msno.matrix(df_train, figsize=(12,5))",No,5,34.0 "df_train[""year""] = df_train[""datetime""].dt.year df_train[""month""] = df_train[""datetime""].dt.month df_train[""day""] = df_train[""datetime""].dt.day df_train[""hour""] = df_train[""datetime""].dt.hour df_train[""minute""] = df_train[""datetime""].dt.minute df_train[""second""] = df_train[""datetime""].dt.second df_test[""year""] = df_test[""datetime""].dt.year df_test[""month""] = df_test[""datetime""].dt.month df_test[""day""] = df_test[""datetime""].dt.day df_test[""hour""] = df_test[""datetime""].dt.hour df_test[""minute""] = df_test[""datetime""].dt.minute df_test[""second""] = df_test[""datetime""].dt.second df_train.shape # datetime ,,,,, . # column 18 .",Yes,4,8.0 "figure, ((ax1,ax2,ax3),(ax4,ax5,ax6)) = plt.subplots(nrows = 2, ncols = 3) figure.set_size_inches(18,10) sns.barplot(data=df_train, x = ""year"", y = ""count"", ax = ax1) sns.barplot(data=df_train, x = ""month"", y = ""count"", ax = ax2) sns.barplot(data=df_train, x = ""day"", y = ""count"", ax = ax3) sns.barplot(data=df_train, x = ""hour"", y = ""count"", ax = ax4) sns.barplot(data=df_train, x = ""minute"", y = ""count"", ax = ax5) sns.barplot(data=df_train, x = ""second"", y = ""count"", ax = ax6) ax1.set(ylabel = ""count"", title = ""Rental amount by year"") ax2.set(ylabel = ""count"", title = ""Rental amount by month"") ax3.set(ylabel = ""count"", title = ""Rental amount by day"") ax4.set(ylabel = ""count"", title = ""Rental amount by hour"") # barplot . # 11 12 . # . # 1~19 . test . # 8 5, 6 , .",No,5,75.0 "df_train[""dayofweek""] = df_train[""datetime""].dt.dayofweek df_test[""dayofweek""] = df_test[""datetime""].dt.dayofweek df_train.shape # . # column 19 .",Yes,4,8.0 "df_train[""dayofweek""].value_counts() # 0~6 . 
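# The stripped comment that follows presumably spelled out the pandas
# convention: Series.dt.dayofweek encodes Monday as 0 through Sunday as 6, so
# values 5 and 6 are Saturday and Sunday, i.e. the weekend that the following
# boxplots compare against workingday.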
# 0 = ~ 6 = . # 5,6(, ) workingday Boxplot .",No,5,72.0 "fig, (ax1, ax2, ax3, ax4, ax5) = plt.subplots(nrows = 5) fig.set_size_inches(18,25) sns.pointplot(data = df_train, x = ""hour"", y = ""count"", ax = ax1) sns.pointplot(data = df_train, x = ""hour"", y = ""count"", hue = ""workingday"", ax = ax2) sns.pointplot(data = df_train, x = ""hour"", y = ""count"", hue = ""dayofweek"", ax = ax3) sns.pointplot(data = df_train, x = ""hour"", y = ""count"", hue = ""weather"", ax = ax4) sns.pointplot(data = df_train, x = ""hour"", y = ""count"", hue = ""season"", ax = ax5) # pointplot . # plot . # plot , . # plot plot , , . # plot , , . . # plot , .",No,5,75.0 "corr_data = df_train[[""temp"", ""atemp"", ""casual"", ""registered"", ""humidity"", ""windspeed"", ""count""]] colormap = plt.cm.PuBu f , ax = plt.subplots(figsize = (12,10)) plt.title('Correlation of Numeric Features with Rental Count',y=1,size=18) sns.heatmap(corr_data.corr(), vmax=.8, linewidths=0.1,square=True,annot=True,cmap=colormap, linecolor=""white"",annot_kws = {'size':14}) # Heatmap . # count registered. test . # casual. # , , . # temp atemp , .",No,5,80.0 "b""fig, (ax1, ax2, ax3) = plt.subplots(ncols = 3, figsize=(12,5))\n\ntemp_scatter_plot = pd.concat([df_train['count'],df_train['temp']],axis = 1)\nsns.regplot(x='temp',y = 'count',data = temp_scatter_plot,scatter= True, fit_reg=True, ax=ax1)\nwindspeed_scatter_plot = pd.concat([df_train['count'],df_train['windspeed']],axis = 1)\nsns.regplot(x='windspeed',y = 'count',data = windspeed_scatter_plot,scatter= True, fit_reg=True, ax=ax2)\nhumidity_scatter_plot = pd.concat([df_train['count'],df_train['humidity']],axis = 1)\nsns.regplot(x='humidity',y = 'count',data = humidity_scatter_plot,scatter= True, fit_reg=True, ax=ax3)\n\n# Scatterplot .\n# windspeed 0 . \n# 0 Null 0 . """,No,5,33.0 "fig, axes = plt.subplots(nrows = 2, figsize = (18,14)) plt.sca(axes[0]) plt.xticks(rotation = 30, ha = ""right"") axes[0].set(ylabel = ""count"", title = ""train windspeed"") sns.countplot(data = df_train, x = ""windspeed"", ax = axes[0]) plt.sca(axes[1]) plt.xticks(rotation = 30, ha = ""right"") axes[1].set(ylabel = ""count"", title = ""test windspeed"") sns.countplot(data = df_test, x = ""windspeed"", ax = axes[1]) # . # 0 . # Feature engineering .",No,5,33.0 "def concatenate_year_month(datetime): return ""{0}-{1}"".format(datetime.year, datetime.month) df_train[""year_month""] = df_train[""datetime""].apply(concatenate_year_month) df_test[""year_month""] = df_test[""datetime""].apply(concatenate_year_month) print(df_train.shape) df_train[[""datetime"", ""year_month""]].head() # year month . '",Yes,4,8.0 "fig, ax = plt.subplots(figsize = (18,4)) sns.barplot(data = df_train, y = ""count"", x = ""year_month"") # 2011 2012 , . # 2012 , . 
# .",No,5,75.0 df_train.loc[Outliers_to_drop],No,5,14.0 "df_train = df_train.drop(Outliers_to_drop, axis = 0).reset_index(drop=True) df_train.shape",Yes,4,10.0 "df_train_num = df_train[[""count"", ""temp"", ""atemp"", ""casual"", ""registered"", ""humidity"", ""windspeed""]] for col in df_train_num: print('{:15}'.format(col), 'Skewness: {:05.2f}'.format(df_train[col].skew()) , ' ' , 'Kurtosis: {:06.2f}'.format(df_train[col].kurt()) ) # '",No,5,40.0 "fig, axes = plt.subplots(nrows = 5, ncols = 2, figsize=(16, 18)) sns.boxplot(data = df_train, y=""count"", x = ""season"", orient = ""v"", ax = axes[0][0]) sns.boxplot(data = df_train, y=""count"", x = ""holiday"", orient = ""v"", ax = axes[0][1]) sns.boxplot(data = df_train, y=""count"", x = ""workingday"", orient = ""v"", ax = axes[1][0]) sns.boxplot(data = df_train, y=""count"", x = ""weather"", orient = ""v"", ax = axes[1][1]) sns.boxplot(data = df_train, y=""count"", x = ""dayofweek"", orient = ""v"", ax = axes[2][0]) sns.boxplot(data = df_train, y=""count"", x = ""month"", orient = ""v"", ax = axes[2][1]) sns.boxplot(data = df_train, y=""count"", x = ""year"", orient = ""v"", ax = axes[3][0]) sns.boxplot(data = df_train, y=""count"", x = ""hour"", orient = ""v"", ax = axes[3][1]) sns.boxplot(data = df_train, y=""count"", x = ""minute"", orient = ""v"", ax = axes[4][0]) axes[0][0].set(ylabel = ""count"", title = ""Rental count by season"") axes[0][1].set(ylabel = ""count"", title = ""Rental count by holiday"") axes[1][0].set(ylabel = ""count"", title = ""Rental count by workingday"") axes[1][1].set(ylabel = ""count"", title = ""Rental count by weather"") axes[2][0].set(ylabel = ""count"", title = ""Rental count by dayofweek"") axes[2][1].set(ylabel = ""count"", title = ""Rental count by month"") axes[3][0].set(ylabel = ""count"", title = ""Rental count by year"") axes[3][1].set(ylabel = ""count"", title = ""Rental count by hour"") axes[4][0].set(ylabel = ""count"", title = ""Rental count by minute"")",No,5,33.0 "f, ax = plt.subplots(1, 1, figsize = (10,6)) g = sns.distplot(df_train[""count""], color = ""b"", label=""Skewness: {:2f}"".format(df_train[""count""].skew()), ax=ax) g = g.legend(loc = ""best"") print(""Skewness: %f"" % df_train[""count""].skew()) print(""Kurtosis: %f"" % df_train[""count""].kurt()) # '",Yes,5,40.0 "df_train[""count_Log""] = df_train[""count""].map(lambda i:np.log(i) if i>0 else 0) f, ax = plt.subplots(1, 1, figsize = (10,6)) g = sns.distplot(df_train[""count_Log""], color = ""b"", label=""Skewness: {:2f}"".format(df_train[""count_Log""].skew()), ax=ax) g = g.legend(loc = ""best"") print(""Skewness: %f"" % df_train['count_Log'].skew()) print(""Kurtosis: %f"" % df_train['count_Log'].kurt()) df_train.drop('count', axis= 1, inplace=True) # .",Yes,4,20.0 "trainWind0 = df_train.loc[df_train[""windspeed""] == 0] trainWindNot0 = df_train.loc[df_train[""windspeed""] != 0] # 0 0 .",No,5,14.0 "from sklearn.ensemble import RandomForestClassifier # RandomForest . def predict_windspeed(data): dataWind0 = data.loc[data[""windspeed""] == 0] dataWindNot0 = data.loc[data[""windspeed""] != 0] # 0 . wcol = [""season"", ""weather"", ""humidity"", ""day"", ""temp"", ""atemp""] # . dataWindNot0[""windspeed""] = dataWindNot0[""windspeed""].astype(""str"") # 0 string . rf_wind = RandomForestClassifier() rf_wind.fit(dataWindNot0[wcol], dataWindNot0[""windspeed""]) wind0 = rf_wind.predict(X=dataWind0[wcol]) # wcol 0 . predictWind0 = dataWind0 predictWindNot0 = dataWindNot0 # . predictWind0[""windspeed""] = wind0 # . 
data = predictWindNot0.append(predictWind0) # 0 . data[""windspeed""] = data[""windspeed""].astype(""float"") # float . data.reset_index(inplace = True) data.drop(""index"", inplace = True, axis = 1) return data '",Yes,2,7.0 "df_train = predict_windspeed(df_train) df_test = predict_windspeed(df_test) fig, (ax1, ax2) = plt.subplots(nrows = 2, figsize = (18,14)) plt.sca(ax1) plt.xticks(rotation = 30, ha = ""right"") ax1.set(ylabel = ""count"", title = ""train windspeed"") sns.countplot(data = df_train, x = ""windspeed"", ax = ax1) plt.sca(ax2) plt.xticks(rotation = 30, ha = ""right"") ax1.set(ylabel = ""count"", title = ""test windspeed"") sns.countplot(data = df_test, x = ""windspeed"", ax = ax2) # , rotation '",Yes,4,33.0 "df_train = pd.get_dummies(df_train, columns = [""weather""], prefix = ""weather"") df_test = pd.get_dummies(df_test, columns = [""weather""], prefix = ""weather"") df_train = pd.get_dummies(df_train, columns = [""season""], prefix = ""season"") df_test = pd.get_dummies(df_test, columns = [""season""], prefix = ""season"") #onehotencoding",No,5,20.0 "corr_data = df_train[[""count_Log"", ""windspeed""]] corr_data.corr()",No,5,40.0 "datetime_test = df_test['datetime'] df_train.drop([""datetime"", ""registered"",""casual"",""holiday"", ""year_month"", ""minute"", ""second""], axis = 1, inplace = True) df_test.drop([""datetime"",""holiday"", ""year_month"", ""minute"", ""second""], axis = 1, inplace = True) '",No,5,10.0 df_test.head(20),No,5,41.0 "from sklearn.model_selection import train_test_split from sklearn import metrics X_train = df_train.drop(""count_Log"", axis = 1).values target_label = df_train[""count_Log""].values X_test = df_test.values X_tr, X_vld, y_tr, y_vld = train_test_split(X_train, target_label, test_size = 0.2, random_state = 2000)",No,4,21.0 "from sklearn.ensemble import GradientBoostingRegressor regressor = GradientBoostingRegressor(n_estimators=2000, learning_rate=0.05, max_depth=4, min_samples_leaf=15, min_samples_split=10, random_state =42) regressor.fit(X_tr,y_tr)",Yes,5,7.0 "y_hat = regressor.predict(X_tr) plt.scatter(y_tr, y_hat, alpha = 0.2) plt.xlabel('Targets (y_tr)',size=18) plt.ylabel('Predictions (y_hat)',size=18) plt.show()",Yes,5,56.0 "y_hat_test = regressor.predict(X_vld) plt.scatter(y_vld, y_hat_test, alpha=0.2) plt.xlabel('Targets (y_vld)',size=18) plt.ylabel('Predictions (y_hat_test)',size=18) plt.show()",Yes,5,56.0 "from sklearn.metrics import mean_squared_log_error,mean_squared_error, r2_score,mean_absolute_error # for regression from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score # for classification models=[GradientBoostingRegressor()] model_names=['regressor'] rmsle=[] d={} for model in range (len(models)): clf=models[model] clf.fit(X_tr,y_tr) test_pred=clf.predict(X_vld) rmsle.append(np.sqrt(mean_squared_log_error(test_pred,y_vld))) d={'Modelling Algo':model_names,'RMSLE':rmsle} d",Yes,5,4.0 "from sklearn.model_selection import cross_val_score accuracies = cross_val_score(estimator = regressor, X = X_tr, y = y_tr, cv = 8)",Yes,5,84.0 "use_logvals = 1 pred_xgb = regressor.predict(X_test) sub_xgb = pd.DataFrame() sub_xgb['datetime'] = datetime_test sub_xgb['count'] = pred_xgb if use_logvals == 1: sub_xgb['count'] = np.exp(sub_xgb['count']) sub_xgb.to_csv('xgb.csv',index=False)",Yes,4,55.0 "# read data (train, test) with pd.read_csv(directory) train = pd.read_csv(""/kaggle/input/bike-sharing-demand/train.csv"") train.head(10) #train.info() #train.shape",Yes,4,45.0 "test = 
pd.read_csv(""/kaggle/input/bike-sharing-demand/test.csv"") test.head(10)",Yes,4,45.0 "b""y = train['count']\n# y . log scaling outlier .\n# MSE(Mean Square Error) // 900 100 800^2 = 6400 ..\n# , outlier log sacling .\ny.sort_values()""",No,3,41.0 "# y . import matplotlib.pyplot as plt import seaborn as sns # wg, dh = plt.subplots(2,1, figsize=(20,12)) # log scaling . sns.distplot(y, ax=dh[0]) # log . sns.distplot(np.log(y), ax=dh[1]) '",No,3,41.0 "b""# , .\n# y x . train test .\n# y = train['count']\n# log scaling y outlier .\ny = np.log(train['count'])\ny # . .""",No,5,8.0 "############## . # 3 . train['datetime'] = train['datetime'].astype('datetime64') # train['datetime'] = pd.to_datetime(train['datetime']) # train = pd.read_csv(""/kaggle/input/bike-sharing-demand/train.csv"", parse_dates = ['datetime']) train.dtypes '",Yes,4,16.0 "test['datetime'] = test['datetime'].astype('datetime64') # test['datetime'] = pd.to_datetime(test['datetime']) # train = pd.read_csv(""/kaggle/input/bike-sharing-demand/train.csv"", parse_dates = ['datetime']) test.dtypes'",Yes,4,16.0 "train['year'] = train['datetime'].dt.year train['weekday'] = train['datetime'].dt.weekday train['hour'] = train['datetime'].dt.hour train.head()",Yes,5,8.0 "test['year'] = test['datetime'].dt.year test['weekday'] = test['datetime'].dt.weekday test['hour'] = test['datetime'].dt.hour test.head()",Yes,4,8.0 "train = train.drop(['datetime', 'casual', 'registered', 'count'], 1) test = test.drop('datetime', 1) ",No,5,10.0 "from xgboost import XGBRegressor xgb = XGBRegressor() xgb.fit(train, y) preds = predict(test)",Yes,4,7.0 "sample = pd.read_csv(""/kaggle/input/bike-sharing-demand/sampleSubmission.csv"") sample.head()",Yes,3,45.0 "b""# np.log train , exp .\nsample['count'] = np.exp(preds)\nsample.head()""",Yes,4,8.0 "sample.to_csv(""sample.csv"", index = False) ",No,5,25.0 "################################################################################################# ############################ EDA INSIGHT ############################### y.sort_values()'",No,3,41.0 "train2 = pd.read_csv(""/kaggle/input/bike-sharing-demand/train.csv"", parse_dates=['datetime']) train2['year'] = train2['datetime'].dt.year train2['month'] = train2['datetime'].dt.month train2['day'] = train2['datetime'].dt.day train2['weekday'] = train2['datetime'].dt.weekday train2['hour'] = train2['datetime'].dt.hour test2 = pd.read_csv('/kaggle/input/bike-sharing-demand/test.csv', parse_dates=['datetime']) test2['day'] = test2['datetime'].dt.day # () , # mean . outlier , median #train2.groupby('hour')['count'].mean() ##### media . . train2.groupby('hour')['count'].median() '",Yes,3,8.0 "b""a, b = plt.subplots(2,2,figsize=(20,12))\nsns.boxplot(train2['year'], train2['count'], ax=b[0,1])\nsns.boxplot(train2['month'], train2['count'], ax=b[1,1])\n### day 1~19 .!\nsns.boxplot(train2['day'], train2['count'], ax=b[0,0])\n### outlier ? ( 5/ 2) 5 . 
\n### count outlier .\nsns.boxplot(train2['hour'], train2['count'], ax=b[1,0])\n """,No,5,75.0 "b""# class .\ntrain2['datetime'].dt.month.value_counts()""",No,4,72.0 "#Let's import the usual suspects import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns %matplotlib inline",Yes,5,23.0 "#Importing the dataset train = pd.read_csv('/kaggle/input/bike-sharing-demand/train.csv') test = pd.read_csv('/kaggle/input/bike-sharing-demand/test.csv') data = train.append(test, sort = False) data.head()",Yes,4,45.0 "#Histogram for count sns.set_style('darkgrid') sns.distplot(train['count'], bins = 100, color = 'green') plt.show()",No,5,33.0 "#Boxplot for count import matplotlib.pyplot as plt sns.boxplot(x = 'count', data = train, color = 'mediumpurple') plt.show()",No,5,33.0 "#Data without the outliers in count data = data[~data.isin(outliers)] data = data[data['datetime'].notnull()]",No,5,14.0 "sns.barplot(x = 'season', y = 'count', data = train, estimator = np.average, palette='coolwarm') plt.ylabel('Average Count') plt.show()",No,5,33.0 "sns.barplot(x = 'workingday', y = 'count', data = train, estimator = np.average, palette='colorblind') plt.ylabel('Average Count') plt.show()",No,5,33.0 "sns.barplot(x = 'weather', y = 'count', data = train, estimator = np.average, palette='deep') plt.ylabel('Average Count') plt.show() ",No,5,33.0 "plt.figure(figsize = (10,7)) tc = train.corr() sns.heatmap(tc, annot = True, cmap = 'coolwarm', linecolor = 'white', linewidths=0.1)",No,5,80.0 "#Convert to integer variables columns=['season', 'holiday', 'workingday', 'weather'] for i in columns: data[i] = data[i].apply(lambda x : int(x))",No,5,8.0 "#Convert string to datatime and create Hour, Month and Day of week data['datetime'] = pd.to_datetime(data['datetime']) data['Hour'] = data['datetime'].apply(lambda x:x.hour) data['Month'] = data['datetime'].apply(lambda x:x.month) data['Day of Week'] = data['datetime'].apply(lambda x:x.dayofweek)",No,4,8.0 "plt.figure(figsize = (8,4)) sns.lineplot(x = 'Month', y = 'count', data = data, estimator = np.average, hue = 'weather', palette = 'coolwarm') plt.ylabel('Average Count') plt.show()",No,5,75.0 data[data['weather'] == 4],No,5,14.0 "fig, axes = plt.subplots(ncols = 2, figsize = (15,5), sharey = True) sns.pointplot(x = 'Hour', y = 'count', data = data, estimator = np.average, hue = 'workingday', ax = axes[0], palette = 'muted') sns.pointplot(x = 'Hour', y = 'count', data = data, estimator = np.average, hue = 'holiday', ax = axes[1], palette = 'muted') ax = [0,1] for i in ax: axes[i].set(ylabel='Average Count')",No,5,75.0 "plt.figure(figsize = (10,4)) sns.pointplot(x = 'Hour', y = 'count', data = data, estimator=np.average, hue = 'Day of Week', palette='coolwarm')",No,5,75.0 "sns.jointplot(x = 'atemp', y = 'count', data = data, kind = 'kde', cmap = 'plasma') plt.show()",No,3,33.0 "plt.figure(figsize = (8,4)) sns.pointplot(x = 'Hour', y = 'casual', data = data, estimator = np.average, color = 'blue') sns.pointplot(x = 'Hour', y = 'registered', data = data, estimator = np.average, color = 'red') plt.ylabel('Registered') plt.show()",No,5,75.0 "#Histogram for Windspeed sns.set_style('darkgrid') sns.distplot(data['windspeed'], bins = 100, color = 'purple') #Windspeed cannot be 0. 
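# Comment only: a windspeed of 0 here is effectively a missing-value code
# rather than a real measurement, which is why the next cell converts the zeros
# to NaN and imputes them.  Note that the fillna there uses
# data.groupby('weather')['season'].transform('mean'); judging from that cell's
# own comment ("mean value grouped by season"), the intended expression was
# probably data.groupby('season')['windspeed'].transform('mean').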
plt.show()",No,5,33.0 "#Replacing 0s in windspeed with the mean value grouped by season data['windspeed'] = data['windspeed'].replace(0, np.nan) data['windspeed'] = data['windspeed'].fillna(data.groupby('weather')['season'].transform('mean')) sns.distplot(data['windspeed'], bins = 100, color = 'red') plt.show()",Yes,3,17.0 "#Encoding cyclical features data['Month_sin'] = data['Month'].apply(lambda x: np.sin((2*np.pi*x)/12)) data['Month_cos'] = data['Month'].apply(lambda x: np.cos((2*np.pi*x)/12)) data['Hour_sin'] = data['Hour'].apply(lambda x: np.sin((2*np.pi*(x+1))/24)) data['Hour_cos'] = data['Hour'].apply(lambda x: np.cos((2*np.pi*(x+1))/24)) data['DayOfWeek_sin'] = data['Day of Week'].apply(lambda x: np.sin((2*np.pi*(x+1))/7)) data['DayOfWeek_cos'] = data['Day of Week'].apply(lambda x: np.cos((2*np.pi*(x+1))/7))",No,5,8.0 "#trainsforming target variable using log transformation data['count'] = np.log(data['count'])",No,5,8.0 "#Converting Categorical to numerical - Removing Co-Linearity data_ = pd.get_dummies(data=data, columns=['season', 'holiday', 'workingday', 'weather']) train_ = data_[pd.notnull(data_['count'])].sort_values(by=[""datetime""]) test_ = data_[~pd.notnull(data_['count'])].sort_values(by=[""datetime""])'",No,4,9.0 "#Standardizing numerical variables from sklearn.preprocessing import StandardScaler cols = ['temp','atemp','humidity', 'windspeed', 'Month_sin', 'Month_cos', 'Hour_sin', 'Hour_cos', 'DayOfWeek_sin','DayOfWeek_cos'] features = data[cols] #Standard Scaler scaler = StandardScaler().fit(features.values) data[cols] = scaler.transform(features.values)",Yes,5,18.0 "from sklearn.linear_model import LinearRegression from sklearn.model_selection import train_test_split from sklearn import metrics",No,5,22.0 "#train test split X = train_[cols] y = train_['count'] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)",Yes,5,13.0 "lm = LinearRegression() lm.fit(X_train, y_train) print(lm.intercept_)",Yes,5,7.0 "plt.figure(figsize = (18,4)) coeff = pd.DataFrame(lm.coef_, index = X.columns, columns = ['Coefficient']) sns.barplot(x = coeff.index, y = 'Coefficient', data = coeff, color = 'red')",No,5,79.0 "plt.figure(figsize = (8,4)) pred = lm.predict(X_test) sns.scatterplot(x = y_test, y = pred) plt.xlabel('Count') plt.ylabel('Predictions') plt.show()",No,5,56.0 "sns.distplot((y_test-pred),bins=100, color = 'gray') plt.show()",No,4,33.0 "print('RMSLE:', np.sqrt(metrics.mean_squared_log_error(np.exp(y_test), np.exp(pred))))",No,5,49.0 "from sklearn.linear_model import Ridge #Assiging different sets of alpha values to explore which can be the best fit for the model. temp_msle = {} for i in np.linspace(0, 40, 20): ridge = Ridge(alpha= i, normalize=True) #fit the model. ridge.fit(X_train, y_train) ## Predicting the target value based on ""Test_x"" pred = ridge.predict(X_test) msle = np.sqrt(metrics.mean_squared_log_error(np.exp(y_test), np.exp(pred))) temp_msle[i] = msle",Yes,5,2.0 "from sklearn.linear_model import Lasso ## Assiging different sets of alpha values to explore which can be the best fit for the model. temp_msle = {} for i in np.logspace(-10, -1, 20): ## Assigin each model. lasso = Lasso(alpha= i, normalize=True, tol = 0.1) ## fit the model. 
lasso.fit(X_train, y_train) ## Predicting the target value based on ""Test_x"" pred = lasso.predict(X_test) msle = np.sqrt(metrics.mean_squared_log_error(np.exp(y_test), np.exp(pred))) temp_msle[i] = msle",Yes,5,2.0 "from sklearn.ensemble import RandomForestRegressor rfr = RandomForestRegressor(n_estimators = 500) rfr.fit(X_train, y_train)",Yes,5,7.0 "plt.figure(figsize = (8,4)) pred = rfr.predict(X_test) sns.scatterplot(x = y_test, y = pred) plt.xlabel('Count') plt.ylabel('Predictions') plt.show()",No,5,56.0 "sns.distplot((y_test-pred),bins=100, color = 'gray')",No,3,33.0 "#RMSLE print('RMSLE:', np.sqrt(metrics.mean_squared_log_error(np.exp(y_test), np.exp(pred))))",No,5,49.0 "#submission new = test_[cols] pred = rfr.predict(new) submission = pd.DataFrame({'datetime':test['datetime'],'count':np.exp(pred)}) submission['count'] = submission['count'].astype(int) submission.to_csv('submission.csv',index=False)",Yes,4,25.0 "b""import pandas as pd\nimport numpy as np\n\nimport matplotlib as mpl\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\n# \n%matplotlib inline\n\n# \nmpl.rcParams['axes.unicode_minus']=False\n\nimport warnings\nwarnings.filterwarnings('ignore')""",Yes,5,23.0 "train=pd.read_csv(""../input/bike-sharing-demand/train.csv"", parse_dates=[""datetime""]) train.shape ",Yes,4,45.0 "test=pd.read_csv(""../input/bike-sharing-demand/test.csv"", parse_dates=[""datetime""]) test.shape",Yes,4,45.0 "train[""year""]=train[""datetime""].dt.year train[""month""]=train[""datetime""].dt.month train[""day""]=train[""datetime""].dt.day train[""hour""]=train[""datetime""].dt.hour train[""minute""]=train[""datetime""].dt.minute train[""second""]=train[""datetime""].dt.second train[""dayofweek""]=train[""datetime""].dt.dayofweek train.shape",Yes,5,8.0 "test[""year""]=test[""datetime""].dt.year test[""month""]=test[""datetime""].dt.month test[""day""]=test[""datetime""].dt.day test[""hour""]=test[""datetime""].dt.hour test[""minute""]=test[""datetime""].dt.minute test[""second""]=test[""datetime""].dt.second test[""dayofweek""]=test[""datetime""].dt.dayofweek test.shape",Yes,5,8.0 "# widspeed 0 . => fig, axes = plt.subplots(nrows=2) fig.set_size_inches(18,10) plt.sca(axes[0]) plt.xticks(rotation=30, ha='right') axes[0].set(ylabel='Count',title=""train windspeed"") sns.countplot(data=train, x=""windspeed"", ax=axes[0]) plt.sca(axes[1]) plt.xticks(rotation=30, ha='right') axes[1].set(ylabel='Count',title=""test windspeed"") sns.countplot(data=test, x=""windspeed"", ax=axes[1])'",No,5,33.0 "b""# 0 .\ntrainWind0 = train.loc[train['windspeed'] == 0]\ntrainWindNot0 = train.loc[train['windspeed'] != 0]\nprint(trainWind0.shape)\nprint(trainWindNot0.shape)""",Yes,4,14.0 "# . from sklearn.ensemble import RandomForestClassifier def predict_windspeed(data): # 0 . dataWind0 = data.loc[data['windspeed'] == 0] dataWindNot0 = data.loc[data['windspeed'] != 0] # . wCol = [""season"", ""weather"", ""humidity"", ""month"", ""temp"", ""year"", ""atemp""] # 0 . dataWindNot0[""windspeed""] = dataWindNot0[""windspeed""].astype(""str"") # . rfModel_wind = RandomForestClassifier() # wCol . rfModel_wind.fit(dataWindNot0[wCol], dataWindNot0[""windspeed""]) # 0 . wind0Values = rfModel_wind.predict(X = dataWind0[wCol]) # # . predictWind0 = dataWind0 predictWindNot0 = dataWindNot0 # 0 . predictWind0[""windspeed""] = wind0Values # dataWindNot0 0 . data = predictWindNot0.append(predictWind0) # float . 
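# The statement that merges the two halves back together appears to have been
# lost when this cell was flattened; mirroring the otherwise identical
# predict_windspeed() defined earlier in this file, it would read:
data = predictWindNot0.append(predictWind0)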
data[""windspeed""] = data[""windspeed""].astype(""float"") data.reset_index(inplace=True) data.drop('index', inplace=True, axis=1) return data'",Yes,2,7.0 "# 0 . train = predict_windspeed(train) # test = predict_windspeed(test) # widspeed 0 fig, ax1 = plt.subplots() fig.set_size_inches(18,6) plt.sca(ax1) # 30 plt.xticks(rotation=30, ha='right') ax1.set(ylabel='Count',title=""train windspeed"") sns.countplot(data=train, x=""windspeed"", ax=ax1)'",Yes,5,56.0 "# feature feature # feature = [""temp"",""humidity"",""windspeed"",""atemp""] # feature type category . weather 1,2,3,4 2=> . # feature one-hot-encodding . categorical_feature_names = [""season"",""holiday"",""workingday"",""weather"", ""dayofweek"",""month"",""year"",""hour""] for var in categorical_feature_names: train[var] = train[var].astype(""category"") test[var] = test[var].astype(""category"")'",No,5,16.0 "# dateset X_train = train[feature_names] print(X_train.shape) X_train.head()'",Yes,4,41.0 "X_test = test[feature_names] print(X_test.shape) X_test.head()",Yes,4,41.0 "label_name = ""count"" y_train = train[label_name] print(y_train.shape) y_train.head()",Yes,4,41.0 "from sklearn.metrics import make_scorer def rmsle(predicted_values, actual_values): # . predicted_values = np.array(predicted_values) actual_values = np.array(actual_values) # 1 . log_predict = np.log(predicted_values + 1) log_actual = np.log(actual_values + 1) # . difference = log_predict - log_actual # difference = (log_predict - log_actual) ** 2 difference = np.square(difference) # . mean_difference = difference.mean() # . score = np.sqrt(mean_difference) return score rmsle_scorer = make_scorer(rmsle) rmsle_scorer'",No,3,49.0 "from sklearn.model_selection import KFold from sklearn.model_selection import cross_val_score k_fold = KFold(n_splits=10, shuffle=True, random_state=0)",Yes,5,84.0 "from sklearn.ensemble import RandomForestRegressor max_depth_list = [] # n_estimators ;; 100 model = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=0) model '",Yes,5,4.0 "%time score = cross_val_score(model, X_train, y_train, cv=k_fold, scoring=rmsle_scorer) score = score.mean() # 0 print(""Score= {0:.5f}"".format(score))'",No,4,28.0 "# , ( ) - model.fit(X_train, y_train)'",No,5,7.0 "# predictions = model.predict(X_test) print(predictions.shape) predictions[0:10]'",Yes,4,48.0 "# . 
fig,(ax1,ax2)= plt.subplots(ncols=2) fig.set_size_inches(12,5) sns.distplot(y_train,ax=ax1,bins=50) ax1.set(title=""train"") sns.distplot(predictions,ax=ax2,bins=50) ax2.set(title=""test"")'",No,4,33.0 "submission = pd.read_csv(""../input/bike-sharing-demand/sampleSubmission.csv"") submission submission[""count""] = predictions print(submission.shape) submission.head()",Yes,3,45.0 "submission.to_csv(""Score_{0:.5f}_sampleSubmission.csv"".format(score), index=False)",No,5,25.0 "#carregar os dados train = pd.read_csv('/kaggle/input/bike-sharing-demand/train.csv') teste = pd.read_csv('/kaggle/input/bike-sharing-demand/test.csv')",No,5,45.0 "#verificando df treino train.info()",No,5,40.0 "#Verificando o df de teste teste.info()",No,5,40.0 "#Transformando o dataframe original na coluna count #vamos usar escala logaritimica train['count'] = np.log(train['count']) ",No,5,8.0 train = train.append(teste),No,3,11.0 "train.head() ",No,5,41.0 train['datetime'] = pd.to_datetime(train['datetime']),No,5,16.0 "#crindo nova coluna usando data e hora train['year'] = train['datetime'].dt.year train['month'] = train['datetime'].dt.month train['day'] = train['datetime'].dt.day train['hour'] = train['datetime'].dt.hour train['dayofweek'] = train['datetime'].dt.dayofweek ",No,5,16.0 "#separando o df de treino e teste #primeiro teste teste = train[train['count'].isnull()]",No,5,13.0 teste.shape,No,5,58.0 "#separando o df de treino e teste #segundo treino treino = train[~train['count'].isnull()]",No,4,13.0 treino.shape,No,5,58.0 "#Separando o df de treino em treino/validao (def = 75/25) from sklearn.model_selection import train_test_split treino, validacao = train_test_split(treino, random_state=42)'",No,5,13.0 "print(treino.shape) treino.head()",Yes,4,41.0 "print(validacao.shape) validacao.head() ",Yes,4,41.0 "#importando from sklearn.tree import DecisionTreeRegressor",No,5,22.0 "#instanciando objeto de decision tree ad = DecisionTreeRegressor(random_state=42)",No,5,4.0 "#treinando o modelo #informar as colunas de entrada e a coluna de resposta (target) ad.fit(treino[usadas], treino['count'])",No,5,7.0 "#prever os dados de validao previsao = ad.predict(validacao[usadas])'",No,5,48.0 "#usando a metrica para validar os dados from sklearn.metrics import mean_squared_error",No,5,22.0 "#instanciar o modelo rf = RandomForestRegressor(random_state=42, n_jobs=1)",No,5,4.0 "#treinando o modelo rf.fit(treino[usadas], treino['count'])",No,5,7.0 "#Fazendo previses em cima dos dados de validao preds = rf.predict(validacao[usadas])'",No,5,48.0 "#verificando o modelo com relao a mtrica #importando a mtrica from sklearn.metrics import mean_squared_error'",No,5,22.0 "b""#aplicando a mtrica\nmean_squared_error(validacao['count'], preds) ** (1/2)\n#0.348204 do professor""",No,5,49.0 "#vamos prever com base nos dados de treino # como o modelo se comporta prevendo em cima de dados conhecidos # o modelo ja esta treinado treino_preds = rf.predict(treino[usadas]) mean_squared_error(treino['count'], treino_preds) ** (1/2)",No,4,49.0 "b""#Gerando as previses para envio ao Kaggle\nteste['count'] = np.exp(rf.predict(teste[usadas]))""",Yes,5,48.0 "#visualizando o arquivo para envio teste[['datetime','count']].head()",No,4,48.0 "#gerando csv teste[['datetime','count']].to_csv('rf.csv', index=False)",No,5,25.0 "from sklearn.ensemble import RandomForestRegressor from sklearn.model_selection import train_test_split from sklearn.metrics import mean_squared_error",No,5,22.0 "b""# importando as bases \ntreino = 
pd.read_csv('/kaggle/input/bike-sharing-demand/train.csv') # trino dados do dia 1 ao dia 19\nteste = pd.read_csv('/kaggle/input/bike-sharing-demand/test.csv') # base de teste do dia 20 ao final do ms""",No,5,45.0 "treino.shape, teste.shape",No,5,58.0 "display(treino.info()) display(teste.info())",No,5,40.0 "b""# Aplicar log na varivel de resposta\ntreino['count'] = np.log(treino['count'])""",No,5,8.0 "# Juntando os dataframes para realizar as modificaes # As observaes do teste ficaram com o campo count nulo # Concatenando as bases para realizar as transformaes nas duas bases de uma vez s treino = treino.append(teste)'",No,5,11.0 "b""# transformando o tipo da varivel datetime em datetime\ntreino['datetime'] = pd.to_datetime(treino['datetime'])""",No,5,16.0 "# Criando novas colunas com a dada e hora (feature engeneering) treino['year'] = treino['datetime'].dt.year treino['month'] = treino['datetime'].dt.month treino['day'] = treino['datetime'].dt.day treino['dayofweek'] = treino['datetime'].dt.dayofweek treino['hour'] = treino['datetime'].dt.hour ",No,5,8.0 "# separando so dataframes teste = treino[treino['count'].isnull()]",No,5,13.0 treino = treino[~treino['count'].isnull()],No,5,14.0 "treino, validacao = train_test_split(treino, random_state=42)",No,5,13.0 "display(treino.info()) display(validacao.info())",No,5,40.0 "b""# selecionando as variveis que sero utilizadas no treinamento\nnao_usadas = ['casual', 'registered', 'count', 'datetime']\n\n# Xriar a lista das colunas de entrada\nusadas = [c for c in treino.columns if c not in nao_usadas]""",No,2,14.0 "# Instanciando o modelo random_forest = RandomForestRegressor(random_state=42, n_jobs=-1)",No,5,4.0 "# Treinando o modelo random_forest.fit(treino[usadas], treino['count'])",No,5,7.0 "# Prevendo os resultados previsao = random_forest.predict(validacao[usadas])",No,5,48.0 "# Avaliando o modelo com o SRMSLE (Square Root Mean Squared Log Error) mean_squared_error(validacao['count'], previsao)**(1/2)",No,5,49.0 "b""# Vamos prever com base nos dados de treino\n# como o modelo se comporta prevendo em cima de dados conhecidos\n# Verificar se est generalizando bem, caso o erro seja zero na base de treino, um forte sinal de overfitting\n\ntreino_preds = random_forest.predict(treino[usadas])\nmean_squared_error(treino['count'], treino_preds) ** (1/2)""",No,4,27.0 "b""# Gerando as previses para envio ao Kaggle\n\nteste['count'] = np.exp(random_forest.predict(teste[usadas]))""",No,5,48.0 "# Gerando o arquivo para submeter ao kaggle teste[['datetime', 'count']].head()",No,5,41.0 "teste[['datetime', 'count']].to_csv('rf.csv' ,index=False)",No,5,25.0 "# verificando o df de treino df.info()",No,5,40.0 "b""# vamos transformar os dados. toda a transformao dever ser replicada nos dados de teste\n\n# aplicar log na varivel de resposta\n\ndf['count'] = np.log(df['count'])""",No,5,8.0 "#apendando os dois para poder fazer a transformao de uma dez s. depois separa df = df.append(test)'",No,5,11.0 "#convertendo a coluna datetime df['datetime'] = pd.to_datetime(df['datetime'])",No,5,16.0 "#criando novas colunas usando data e hora df['year'] = df['datetime'].dt.year df['month'] = df['datetime'].dt.month df['day'] = df['datetime'].dt.day df['hour'] = df['datetime'].dt.hour df['dayofweek'] = df['datetime'].dt.dayofweek",No,5,8.0 "b""# dividir os dados que foram juntados para a transformao - treino e teste. 
se estiver dado nulo nas tres variveis target\n# pertence ao df de teste\n\n# primeiro os dados de teste\ntest = df[df['count'].isnull()]""",No,5,13.0 "b""# agora os dados de treino o sinal de til a negao quando uma comparao no est envolvida\n\ndf = df[~df['count'].isnull()]""",No,5,14.0 "# dividindo o df de treino # importando o scikitlearn para a diviso da base from sklearn.model_selection import train_test_split'",No,5,22.0 "#dividir 75% treino e 25% validao - padro train, valid = train_test_split(df, random_state=42) '",No,5,13.0 "b""#escolher as colunas que vo ser usadas e as que no\n\n# lista das colunas no usadas\nremoved_cols = ['casual', 'registered', 'count', 'datetime']\n\n#lista das columas de entrada\n\nfeats = [c for c in train.columns if c not in removed_cols]\n""",No,2,14.0 feats,No,3,71.0 "# usando o random forest # importando o modelo from sklearn.ensemble import RandomForestRegressor",No,5,22.0 "# instanciar o modelo rf = RandomForestRegressor(random_state=42,n_jobs=-1) #n_jobs nr de job que rodam e paralalo para dar o fit. -1 #para usar todos os processadores #n_estimator - nro de arvores. o defalt 10 mas mudar para 100 na proxima verso 0.22 '",No,5,4.0 "#treinar o modelo com os dados de treino rf.fit(train[feats], train['count'])",No,5,7.0 "#faznedo as previses em cima dos dados de validao preds = rf.predict(valid[feats])'",No,5,48.0 "b""#aplicando a mtrica\nmean_squared_error(valid['count'], preds) ** (1/2)\n""",No,5,49.0 train_preds = rf.predict(train[feats]),No,5,27.0 "# aplicando nos dados de treino #dados conhecidos mean_squared_error(train['count'], train_preds) ** (1/2)",No,5,28.0 " test['count'] = np.exp(rf.predict(test[feats]))",No,5,48.0 "test[['datetime', 'count']].head()",No,5,41.0 "b""#Aumentar a floresta - 200 rvores\n# instanciar o modelo\nrf2 = RandomForestRegressor(random_state=42,n_jobs=-1, n_estimators=200, min_samples_leaf=5)\n#n_jobs nr de job que rodam e paralalo para dar o fit. -1 #para usar todos os processadores\n#n_estimator - nro de arvores. o defalt 10 mas mudar para 100 na proxima verso 0.22\n\n#treinar o modelo com os dados de treino\n\nrf2.fit(train[feats], train['count'])\n\n#Previses com os dados de validao\npreds2 = rf2.predict(valid[feats])\n\n# Aplicar mtrica sobre os dados de validao\n\nmean_squared_error(valid['count'], preds2)**(1/2)""",Yes,4,7.0 "#Gerando o novo arquivo test['count'] = np.exp(rf2.predict(test[feats])) #visualizando o arquivo para envio test[['datetime', 'count']].to_csv('rf2.csv', index=False)",Yes,5,25.0 "#Pandas Rolling df = df.append(test)",No,5,11.0 "#ordenando o dataframe df.sort_values('datetime', inplace=True)",No,5,9.0 "#Criando a coluna rolling_temp df['rolling_temp'] = df['temp'].rolling(3,min_periods=1).mean()",No,4,8.0 "#Criando a coluna rolling_atemp df['rolling_atemp'] = df['atemp'].rolling(3,min_periods=1).mean()",No,5,8.0 "#Separando os dataframes test = df[df['casual'].isnull()] df = df[~df['casual'].isnull()]",No,5,14.0 "# Dividindo os dados de treino em train and validation train, valid = train_test_split(df, random_state=42)",No,5,13.0 "#escolher as colunas que vo ser usadas #lista das columas de entrada feats = [c for c in train.columns if c not in removed_cols]'",No,3,14.0 "b""#Novo modelo usando colunas roling\n# instanciar o modelo\nrf3 = RandomForestRegressor(random_state=42,n_jobs=-1, n_estimators=200, min_samples_leaf=5)\n#n_jobs nr de job que rodam e paralalo para dar o fit. -1 #para usar todos os processadores\n#n_estimator - nro de arvores. 
o defalt 10 mas mudar para 100 na proxima verso 0.22\n\n#treinar o modelo com os dados de treino\n\nrf3.fit(train[feats], train['count'])\n\n#Previses com os dados de validao\npreds3 = rf3.predict(valid[feats])\n\n# Aplicar mtrica sobre os dados de validao\n\nmean_squared_error(valid['count'], preds3)**(1/2)""",Yes,3,7.0 "#Gerando o novo arquivo test['count'] = np.exp(rf3.predict(test[feats])) #visualizando o arquivo para envio test[['datetime', 'count']].to_csv('rf3.csv', index=False)",Yes,4,48.0 "# import modules from sklearn.ensemble import RandomForestRegressor from sklearn.model_selection import GridSearchCV from sklearn.preprocessing import StandardScaler from sklearn.linear_model import Ridge from sklearn.linear_model import Lasso from matplotlib import pyplot as plt from datetime import datetime as dt import seaborn as sns # set graphics dark mode plt.style.use('dark_background') # import dataset trainset = pd.read_csv('/kaggle/input/bike-sharing-demand/train.csv') testset = pd.read_csv('/kaggle/input/bike-sharing-demand/test.csv') # dataset quick view trainset.head()",Yes,3,45.0 "# create date feature from datetime trainset.insert(1, 'date', pd.DataFrame([x[:10] for x in trainset.datetime])) # create time feature from datetime trainset.insert(2, 'time', pd.DataFrame([x[11:] for x in trainset.datetime])) # convert datetime from string to datetime trainset.date = [dt.strptime(x, '%Y-%m-%d').weekday() for x in trainset.date] # drop datetime column since we created two variables and casual and registered since their value is contained in count trainset.drop(['datetime'], axis = 1, inplace = True) # trainset quick view trainset.head()",Yes,4,16.0 "# get index of the time elements in unique list _, idx = np.unique(trainset.time, return_inverse = True) # replace time feature with the index just computed trainset.time = idx # trainset quick view trainset.head()",No,5,8.0 "# date - count boxplot plt.figure(), sns.boxplot(x = trainset['date'], y = trainset['count'])",No,5,75.0 "# check sum of nulls trainset.isnull().sum()",No,5,39.0 "# draw the pairplot of the variables plt.figure(), sns.pairplot(trainset) # check target boxplot to see outliers plt.figure(), sns.boxplot(trainset['count'])",Yes,5,81.0 "# apply log transform to remove the number of outliers trainset['count'] = np.log(trainset['count']) # repeat pairplot plt.figure(), sns.pairplot(trainset) # repeat boxplot plt.figure(), sns.boxplot(trainset['count'])",Yes,5,81.0 "# variables correlation heatmap plt.figure(figsize = (10,10)), sns.heatmap(trainset.corr())",No,5,80.0 "# remove features highly correlated trainset.drop(['casual','registered','temp'], axis = 1, inplace = True) # graph heatmap again plt.figure(figsize = (10,10)), sns.heatmap(trainset.corr())",Yes,3,10.0 "# group time values into day segments trainset.time = [0 if x >= 0 and x < 6 else(1 if x > 5 and x < 13 else (2 if x > 12 and x < 19 else 3)) for x in trainset.time] # trainset quick view trainset.head()",Yes,4,16.0 "# get original datetime column for submission testdates = testset.datetime # create date feature from datetime testset.insert(1, 'date', pd.DataFrame([x[:10] for x in testset.datetime])) # create time feature from datetime testset.insert(2, 'time', pd.DataFrame([x[11:] for x in testset.datetime])) # convert datetime from string to datetime testset.date = [dt.strptime(x, '%Y-%m-%d').weekday() for x in testset.date] # drop datetime column since we created two variables and casual and registered since their value is contained in count 
testset.drop(['datetime'], axis = 1, inplace = True) # get index of the time elements in unique list _, idx = np.unique(testset.time, return_inverse = True) # replace time feature with the index just computed testset.time = idx # replace date with weekday testset.date = [1 if x >= 0 and x < 6 else 0 for x in testset.date] # replace feature name testset.rename(columns = {'date':'weekday'}, inplace = True) # remove features highly correlated testset.drop(['temp'], axis = 1, inplace = True) # group time values into day segments testset.time = [0 if x >= 0 and x < 6 else(1 if x > 5 and x < 13 else (2 if x > 12 and x < 19 else 3)) for x in testset.time] # testset quick view testset.head()",Yes,5,8.0 "# features Xtrain = trainset.iloc[:,:-1] Xtest = testset.iloc[:,:] # target ytrain = trainset.iloc[:,-1] # standard scaler sca = StandardScaler().fit(Xtrain) # standarize features Xtrain = sca.transform(Xtrain) Xtest = sca.transform(Xtest) # classifier clf = RandomForestRegressor(random_state = 0) # regularization parameter range param_grid = {'n_estimators': [25, 50, 100], 'max_features': [3, 6]} # grid search grid = GridSearchCV(estimator = clf, scoring = 'neg_mean_squared_log_error', param_grid = param_grid) # training clf.fit(Xtrain, ytrain) # predictions preds = np.round(np.exp(clf.predict(Xtest))) # clip negatives in case there are preds[preds < 0] = 0 # submission pd.DataFrame({'datetime': testdates, 'count': preds}).to_csv('my_submission.csv', index = False)",Yes,3,6.0 "import pandas as pd import numpy as np import seaborn as sns import matplotlib.pyplot as plt from scipy.special import boxcox, inv_boxcox train_df=pd.read_csv('../input/train.csv') train_df.describe()",No,3,45.0 "sns.boxplot(train_df['count']) plt.show() cnt=train_df['count'].values q99=np.percentile(cnt,[99]) train_df=train_df[train_df['count'] '",Yes,5,88.0 "train = pd.read_csv('/kaggle/input/bike-sharing-demand/train.csv',parse_dates = [""datetime""])# object train.head(30) # 5 .() . train['hour']=train['datetime'].dt.hour #hour, year train['year'] = train['datetime'].dt.year train['dayofweek']=train['datetime'].dt.dayofweek #weekday train['day']=train['datetime'].dt.day train['month']=train['datetime'].dt.month #train['week']=train['datetime'].dt.week train.head(30)'",Yes,3,8.0 "test = pd.read_csv('/kaggle/input/bike-sharing-demand/test.csv',parse_dates = [""datetime""])# (count) . test['hour'] = test['datetime'].dt.hour test['year'] = test['datetime'].dt.year test['dayofweek']=test['datetime'].dt.dayofweek test['day']=test['datetime'].dt.day test['month']=test['datetime'].dt.month #test['week']=test['datetime'].dt.week test.head(30)'",Yes,3,8.0 "b""weekday_df=train[train['workingday']==1] # ==. workingday 1 True => True . \nprint(weekday_df.shape) # \n\nweekend_df=train[train['workingday']==0]\nweekend_df.shape""",No,4,14.0 "b""import matplotlib.pyplot as plt # matplotlib \nimport seaborn as sns # \na,b = plt.subplots(1,1,figsize=(20,12)) #,, \nsns.boxplot(train['hour'],train['count']) #boxplot => (count), (hour) \n# => \n# => 25%\n# => \n# => . . \n# 10 ~15 . => 5 , . => .=> .\n""",Yes,5,75.0 "b""a,b = plt.subplots(1,1,figsize=(20,12))\nsns.distplot(train['count'])# \n# \n# ,\n#1. . \n#2. .( )\n\n# y(train count) \n# train test . \n# ? => . . . \n#=> => y .\n#=> . \n\n""",No,5,33.0 "b""a,b = plt.subplots(1,1,figsize=(20,12)) \nsns.boxplot(train['month'],train['count']) #tree column 1 column . day column . 
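# The count target is strongly right-skewed (see the distplot above); this notebook
# later fits on np.log(train['count']) and inverts predictions with np.exp. A minimal
# zero-safe variant of that transform, shown here only as an illustration, uses
# log1p/expm1:
import numpy as np
_y_log = np.log1p(train['count'])   # log(1 + y) is defined even when y == 0
_y_back = np.expm1(_y_log)          # exact inverse of log1p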
""",No,5,33.0 "a,b = plt.subplots(1,1,figsize=(20,12)) sns.boxplot(weekday_df['hour'],weekday_df['count']) ",No,5,75.0 "a,b = plt.subplots(1,1,figsize=(20,12)) sns.boxplot(weekend_df['hour'],weekend_df['count']) ",No,5,75.0 "figure, (a,b,c,d,e,f) = plt.subplots(nrows=6) figure.set_size_inches(18,25) sns.pointplot(train['hour'],train['count'], ax = a) sns.pointplot(train['hour'],train['count'],hue = train['workingday'], ax = b) sns.pointplot(train['hour'],train['count'],hue = train['holiday'], ax = c) sns.pointplot(train['hour'],train['count'],hue = train['dayofweek'], ax = d) sns.pointplot(train['hour'],train['count'],hue = train['season'], ax = e) sns.pointplot(train['hour'],train['count'],hue = train['weather'], ax = f)",No,5,75.0 "b""print(train.groupby('year')['count'].mean()) # . \ntrain.groupby('year')['count'].median() #. => """,No,2,40.0 "b""train_2011=train[train['year']==2011] #2011 .\ntrain_2011.groupby('month')['count'].mean()\na,b=plt.subplots(1,1,figsize=(20,12))\nsns.boxplot(train_2011['month'],train['count'])""",No,4,33.0 "b""print(train.groupby('dayofweek')['count'].mean()) # 0~6 => ~\ntrain.groupby('holiday')['count'].mean() # ?""",No,3,40.0 "b""train['dayofweek'].value_counts() # . . . """,No,5,72.0 train.dtypes #datetime object. . .,No,5,70.0 "b""train2 = train.drop(['datetime','casual','registered','count','month','day'],axis=1) # datetime, test 3 . . \n# train2 4 . axis=0 -> row . axis=1 -> column . \ntrain2.head()""",Yes,5,10.0 "b""test2 = test.drop(['datetime','month','day'],axis=1) # test datetime \ntest2.head()""",Yes,4,10.0 "b""# # \n# from sklearn.ensemble import RandomForestRegressor \n\n# # \n# rf = RandomForestRegressor(n_estimators=100,random_state=1,n_jobs=4) # / /\n\n#; 10-> => 100 / / \n#random_state => / n_jobs =4 , \n\n# # 100 ? => . train set \n# . test set a=b \n# \n\n# #\n# rf.fit(train2,np.log(train['count']))\n# #\n# result = rf.predict(test2)\n# test['count']= result #test count result . \n# test.head(10)\n\n# # column , . . \n# # \n# #1. .\n# #2. . \n\n\n# 100 . lgbm . \n# lgbm\n# from lightgbm import LGBMRegressor\n# lgb=LGBMRegressor()\n# lgb.fit(train2,np.log(train['count']))\n# result=lgb.predict(test2)\n# test['count']=result\n# test.head()\n\n# lgbm xgboost ? => X. . \n\n#2 xgboost => . 100 .\n# * . . .\n\n# # ***\n# tree =>\n# tree . => max_depth\n#xgb=> max_depth 3 . . \n\n\nfrom xgboost import XGBRegressor\nxgb=XGBRegressor(nthread=4,max_depth=5) # . . nthread=> . n_jobs . CPU .\nxgb.fit(train2,np.log(train['count']))\nresult=xgb.predict(test2)""",Yes,4,7.0 "Sub = pd.read_csv('/kaggle/input/bike-sharing-demand/sampleSubmission.csv') Sub.head()",Yes,4,45.0 "b""Sub['count'] = np.exp(result) # #\nSub.head()""",Yes,5,55.0 "b""Sub.to_csv('20191231.csv',index=False) # index=False index=True . column 2. """,No,5,25.0 "import pandas as pd %matplotlib inline import seaborn as sns import numpy as np import matplotlib.pyplot as plt from scipy.stats import norm import warnings; warnings.simplefilter('ignore') # Input data files are available in the ""../input/"" directory. 
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory import os for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename))'",Yes,5,88.0 "train=data=pd.read_csv('/kaggle/input/bike-sharing-demand/train.csv') test=pd.read_csv('/kaggle/input/bike-sharing-demand/test.csv') train.info() Y1train=train['casual'] Y2train=train['registered'] Ytrain=train['count'] ",No,3,45.0 "figure, axs = plt.subplots(nrows=3, ncols=2) figure.set_size_inches(14,6) sns.distplot(Ytrain, ax=axs[0][0], fit=norm) sns.distplot(np.log(Ytrain+1), ax=axs[0][1], fit=norm) sns.distplot(Y1train, ax=axs[1][0], fit=norm) sns.distplot(np.log(Y1train+1), ax=axs[1][1], fit=norm) sns.distplot(Y2train, ax=axs[2][0], fit=norm) sns.distplot(np.log(Y2train+1), ax=axs[2][1], fit=norm)",No,5,33.0 "feature_names=list(test) train=train[feature_names] all_data=pd.concat((train, test)) print(train.shape, test.shape, all_data.shape) print(Ytrain) all_data['datetime']=pd.to_datetime(all_data['datetime']) all_data['year']=all_data['datetime'].dt.year all_data['month']=all_data['datetime'].dt.month all_data['day']=all_data['datetime'].dt.day all_data['hour']=all_data['datetime'].dt.hour all_data['dayofweek']=all_data['datetime'].dt.dayofweek all_data=all_data.drop(columns='datetime') all_data.loc[all_data['windspeed']==0, 'windspeed']=all_data['windspeed'].mean() print(train.shape, test.shape, all_data.shape)",Yes,3,8.0 "Xtrain=all_data[:len(train)] Xtest=all_data[len(train):] Xtrain.info() #"""""" import itertools import copy tmpXtrain = copy.deepcopy(Xtrain) tmpXtest = copy.deepcopy(Xtest) for cmb in itertools.combinations_with_replacement(list(Xtrain.keys()), 2): tmpXtrain[""-"".join(cmb)] = Xtrain[cmb[0]] * Xtrain[cmb[1]] tmpXtest[""-"".join(cmb)] = Xtest[cmb[0]] * Xtest[cmb[1]] #""""""",Yes,4,13.0 !pip install optuna,No,5,87.0 "import optuna.integration.lightgbm as lgb from sklearn.model_selection import train_test_split from sklearn import datasets X_train, X_test, y_train, y_test = train_test_split(tmpXtrain, np.log1p(Y1train), test_size=0.1) lgb_train = lgb.Dataset(X_train, y_train) lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train) lgbm_params = { 'objective': 'regression', 'metric': 'rmse', } best_params, tuning_history = dict(), list() booster_casual = lgb.train(lgbm_params, lgb_train, valid_sets=lgb_eval, verbose_eval=0, best_params=best_params, tuning_history=tuning_history) print(""Best Params:"", best_params) print(""Tuning history:"", tuning_history) '",Yes,3,7.0 "X_train, X_test, y_train, y_test = train_test_split(tmpXtrain, np.log1p(Y2train), test_size=0.1) lgb_train = lgb.Dataset(X_train, y_train) lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train) lgbm_params = { 'objective': 'regression', 'metric': 'rmse', } best_params, tuning_history = dict(), list() booster_registered = lgb.train(lgbm_params, lgb_train, valid_sets=lgb_eval, verbose_eval=0, best_params=best_params, tuning_history=tuning_history) print(""Best Params:"", best_params) print(""Tuning history:"", tuning_history)'",Yes,3,13.0 "pred_casual = booster_casual.predict(tmpXtest, num_iteration=booster_casual.best_iteration) pred_casual = np.expm1(pred_casual) pred_registered = booster_registered.predict(tmpXtest, num_iteration=booster_registered.best_iteration) pred_registered = np.expm1(pred_registered) pred = pred_casual + pred_registered pred[pred<0] = 0 submission = pd.DataFrame({'datetime': test.datetime, 'count': pred}, 
columns=['datetime', 'count']) submission.to_csv(""submission.csv"", index=False)'",Yes,4,25.0 "b""# \nimport pandas as pd\nimport numpy as np\nimport seaborn as sns\nfrom matplotlib import pyplot as plt\nfrom sklearn import metrics\nfrom sklearn.linear_model import LinearRegression\nfrom sklearn.model_selection import train_test_split\nfrom pathlib import Path\nfrom IPython.display import display\nfrom datetime import datetime\nfrom pandas import DataFrame\nfrom typing import List, NamedTuple, Tuple\n\n# allow plots to appear directly in the notebook\n%matplotlib \n\n# Supress Warnings\nimport warnings\nwarnings.filterwarnings('ignore')\n\npd.set_option('display.max_rows', 100)\npd.set_option('display.max_columns', 50)\npd.set_option('display.width', 1000)""",Yes,5,23.0 "# def load(path: Path) -> DataFrame: # return pd.read_csv(path) return pd.read_csv(path, parse_dates=True, index_col=""datetime"")'",No,5,45.0 "ROOT_DIR = Path(""/kaggle/input/bike-sharing-demand"") TRAIN_DATA_PATH = ROOT_DIR / ""train.csv"" TEST_DATA_PATH = ROOT_DIR / ""test.csv"" TARGET_NAME = 'count' original_train: DataFrame = load(TRAIN_DATA_PATH) original_test: DataFrame = load(TEST_DATA_PATH)'",No,4,45.0 original_train.head(),No,5,41.0 original_test.head(),No,5,41.0 "only_train_columns = [c1 for c1 in original_train.columns if not c1 in original_test.columns and c1 != TARGET_NAME] print('Only Train Columns') print(only_train_columns) only_test_columns = [c1 for c1 in original_test.columns if not c1 in original_train.columns] print('Only Test Columns') print(only_test_columns)",Yes,3,13.0 original_train.isnull().sum(),No,5,39.0 original_test.isnull().sum(),No,5,39.0 original_train.dtypes,No,5,70.0 original_test.dtypes,No,5,70.0 "feature_columns = [c1 for c1 in original_train.columns if c1 in original_test.columns and c1 != TARGET_NAME] feature_columns",No,4,14.0 sns.distplot(original_train[TARGET_NAME]),No,5,33.0 "def draw_distplot(df, name, fig, m, n, idx): ax = fig.add_subplot(m, n, idx) ax = sns.distplot(df[name]) def draw_distplots(df, columns): M = round(len(columns)/2) N = 2 fig = plt.figure(figsize=[N*10, M*6]) for idx, name in enumerate(columns): draw_distplot(df=df, name=name, fig=fig, m=M, n=N, idx=idx+1) draw_distplots(df=original_train, columns=feature_columns)",No,5,33.0 sns.pairplot(original_train[feature_columns]),No,3,33.0 "sns.heatmap(original_train[feature_columns].corr(), annot=True)",No,5,80.0 "def draw_boxplot(df, x_name, y_name, fig, m, n, idx): ax = fig.add_subplot(m, n, idx) ax = sns.boxplot(data=df, x=x_name, y=y_name) def draw_boxplots(df, x_columns, y_name): M = round(len(x_columns)/2) N = 2 fig = plt.figure(figsize=[N*10, M*6]) for idx, name in enumerate(x_columns): draw_boxplot(df=df, x_name=name,y_name=y_name, fig=fig, m=M, n=N, idx=idx+1) draw_boxplots(df=original_train, x_columns=categorical_feature_columns, y_name=TARGET_NAME)",No,5,33.0 "def draw_scatterplot(df, x_name, y_name, fig, m, n, idx): ax = fig.add_subplot(m, n, idx) ax = sns.scatterplot(data=df, x=x_name, y=y_name) def draw_scatterplots(df, x_columns, y_name): M = round(len(x_columns)/2) N = 2 fig = plt.figure(figsize=[N*10, M*6]) for idx, name in enumerate(x_columns): draw_scatterplot(df=df, x_name=name,y_name=y_name, fig=fig, m=M, n=N, idx=idx+1) draw_scatterplots(df=original_train, x_columns=numeric_feature_columns, y_name=TARGET_NAME)",No,5,33.0 "train_data = original_train.copy() test_data = original_test.copy()",No,5,12.0 "#categorical columns change to one-hot encoding data def replaced_with_onehot_cols(data: 
DataFrame, col_names: List[str]) -> DataFrame: data = data.copy() for col_name in col_names: one_hot = pd.get_dummies(data[col_name], prefix=col_name) data = data.join(one_hot) # Original column is not needed anymore del data[col_name] return data",No,5,20.0 "train_data = replaced_with_onehot_cols(data=train_data, col_names=categorical_feature_columns) test_data = replaced_with_onehot_cols(data=test_data, col_names=categorical_feature_columns)",No,3,20.0 "#remove only_train_columns train_data = train_data.drop(only_train_columns, axis=1) train_data.head()",Yes,4,10.0 "#seperate datetime index def expanded_index_datetime_col(data: DataFrame) -> DataFrame: data = data.copy() data[""hour""] = data.index.hour data[""weekday""] = data.index.weekday data[""month""] = data.index.month data[""year""] = data.index.year return data",No,5,8.0 "train_data = expanded_index_datetime_col(data=train_data) test_data = expanded_index_datetime_col(data=test_data) train_data.head()",Yes,3,16.0 "#change datetime data to one-hot data datetime_cols = ['hour', 'weekday','month','year'] train_data = replaced_with_onehot_cols(data=train_data, col_names=datetime_cols) test_data = replaced_with_onehot_cols(data=test_data, col_names=datetime_cols) train_data.head()",Yes,4,20.0 "from sklearn.preprocessing import MinMaxScaler def normalize_cols(df: DataFrame, scaler) -> DataFrame: df = df.copy() return DataFrame(scaler.fit_transform(df.values), columns=df.columns, index=df.index) x_scaler = MinMaxScaler() x = train_data.drop(TARGET_NAME, axis=1) x = normalize_cols(df=x, scaler=x_scaler) y_scaler = MinMaxScaler() y = train_data[[TARGET_NAME]] y = normalize_cols(df=y, scaler=y_scaler) test_data = normalize_cols(df=test_data, scaler=x_scaler)",Yes,5,21.0 x.head(),No,5,41.0 y.head(),No,5,41.0 "from sklearn.ensemble import RandomForestRegressor model = RandomForestRegressor() result = model.fit(x.values, y.values)",Yes,5,7.0 "features = pd.DataFrame() features[""features""] = x.columns features[""coefficient""] = model.feature_importances_ features.sort_values(by=[""coefficient""], ascending=False, inplace=True) fig,ax= plt.subplots() fig.set_size_inches(20,20) sns.barplot(data=features, x=""coefficient"", y=""features"");",No,5,79.0 "!pip install livelossplot tensorflow-gpu import tensorflow.keras.backend as K import tensorflow as tf",Yes,5,87.0 "#Split train, test data from sklearn.model_selection import train_test_split random_seed = 5 x_train, x_validation, y_train, y_validation = train_test_split(x, y, test_size=0.3, shuffle=True, random_state=random_seed) print('train data count : ' + str(len(x_train))) print('test data count : ' + str(len(x_validation)))",Yes,5,13.0 "#make cost function from sklearn import metrics def rmsle_K(y, pred): return K.sqrt(K.mean(K.square(tf.math.log1p(y) - tf.math.log1p(pred)))) def rmsle(y, pred): return np.sqrt(metrics.mean_squared_error(y, pred))",Yes,4,49.0 "#Make DL Models from tensorflow.keras.layers import Input, Dense, Dropout from tensorflow.keras.models import Model from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau _, NUM_FEATURES = x_train.shape def make_dl_model()-> Model: input = Input(shape=(NUM_FEATURES, )) _ = Dense(32, activation='relu')(input) _ = Dropout(0.4)(_) _ = Dense(32, activation='relu')(_) _ = Dropout(0.4)(_) _ = Dense(16, activation='relu')(_) output = Dense(1, activation='relu')(_) model = Model(inputs=input, outputs=output) model.compile(optimizer='adam', loss=rmsle_K, metrics=['mse']) return model ",Yes,4,4.0 "model = make_dl_model() 
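# The cells below append to `results` and later read `sorted_result`, but the cell that
# defines them is not among those shown. A minimal sketch consistent with the NamedTuple
# import used in this notebook; the field names follow the keyword arguments used below,
# while the NamedTuple form and the ascending sort by cost are assumptions.
from typing import List, NamedTuple

class ModelResult(NamedTuple):
    name: str
    cost: float
    model: object

results: List[ModelResult] = []
# e.g. once every model has been scored:
# sorted_result = sorted(results, key=lambda r: r.cost)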
model.fit(x_train,y_train, validation_data=(x_validation,y_validation), epochs=200, batch_size=128, verbose=1, callbacks=[ReduceLROnPlateau(monitor='val_loss', factor=0.4, patience=5, min_lr=0.000001, verbose=1), EarlyStopping(monitor=""val_loss"", patience=10, verbose=0), ] ) results.append(ModelResult(name='DL',cost=rmsle(y_validation, model.predict(x_validation)), model=model))'",Yes,4,7.0 "#Make ML Models from sklearn.linear_model import LinearRegression from sklearn.linear_model import Lasso from sklearn.linear_model import Ridge from sklearn.linear_model import ElasticNet from sklearn.ensemble import RandomForestRegressor from xgboost import XGBRegressor ml_models = { 'LinearRegression': LinearRegression(), 'LassoRegression': Lasso(), 'RidgeRegression': Ridge(), 'ElasticNet': ElasticNet(), 'RandomForestRegressor': RandomForestRegressor(), 'XGBRegressor': XGBRegressor() } #train ML Models for name, model in ml_models.items(): model.fit(x_train, y_train) results.append(ModelResult(name=name,cost=rmsle(y_validation, model.predict(x_validation)), model=model))",Yes,5,3.0 "best_model = sorted_result[0].model y_pred = best_model.predict(test_data) y_pred = y_scaler.inverse_transform(y_pred.reshape(-1, 1)).astype(int)",Yes,4,3.0 "#Save Submission submission = test_data.copy() submission[""datetime""] = test_data.index submission[""count""] = y_pred.astype(int) submission = submission[[""datetime"", ""count""]] submission.to_csv('submission.csv', index=False) '",Yes,5,25.0 "#Import libraries import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns %matplotlib inline",Yes,5,23.0 "from IPython.core.interactiveshell import InteractiveShell InteractiveShell.ast_node_interactivity = ""all"" ",Yes,5,23.0 "sns.set(style=""dark"") sns.set(style=""whitegrid"", color_codes=True)",No,5,23.0 "train=pd.read_csv('/kaggle/input/bike-sharing-demand/train.csv') test=pd.read_csv('/kaggle/input/bike-sharing-demand/test.csv') print('train shape:',train.shape) print('test shape:',test.shape)",Yes,4,45.0 "#check for null data train.isnull().sum()",No,5,39.0 "import missingno as msno fig,ax=plt.subplots(2,1,figsize=(10,5)) msno.matrix(train,ax=ax[0]) ax[0].set_title('Train Data') msno.matrix(test,ax=ax[1]) ax[1].set_title('Test Data')",Yes,5,34.0 "#variable datatype: train.info()",No,5,40.0 "from datetime import datetime from dateutil import parser import calendar #parse string datetime into datetime format train['datetime2']=train.datetime.apply(lambda x: parser.parse(x)) #Get some different time variables train['year']=train.datetime2.apply(lambda x: x.year) train['month']=train.datetime2.apply(lambda x: x.month) train['weekday']=train.datetime2.apply(lambda x: x.weekday()) train['weekday_name']=train.datetime2.apply(lambda x: calendar.day_name[x.weekday()]) train['hour']=train.datetime2.apply(lambda x: x.hour) ",Yes,5,8.0 "#create categorical data train['season_decode']=train.season.map({1:'spring',2:'summer',3:'fall',4:'winter'}) train['working_decode']=train.workingday.map({1:'work',0:'notwork'}) train['weather_decode']=train.weather.map({1:'Clear',2:'Mist',3:'LightRain',4:'HeavyRain'})",No,5,20.0 "f,ax=plt.subplots(1,2) sns.distplot(train['count'],bins=30,ax=ax[0]) ax[0].set_title('count distrib') sns.boxplot(data=train,y=train['count'],ax=ax[1]) ax[1].set_title('count boxplot')",No,5,33.0 "mean_count=train['count'].mean() std_count=train['count'].std() print(mean_count-3*std_count) print(mean_count+3*std_count) outliers1=train[train['count']>(mean_count+3*std_count)] 
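# For comparison only (not the author's method): with right-skewed counts an IQR fence
# is another common outlier cut-off; this snippet just reports it alongside the
# mean + 3*std rule used above.
q1, q3 = train['count'].quantile([0.25, 0.75])
iqr_upper = q3 + 1.5 * (q3 - q1)
print('IQR upper fence:', iqr_upper, '| rows above it:', (train['count'] > iqr_upper).sum())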
len(outliers1['count'])",No,3,40.0 "train2=train[train['count']<=(mean_count+3*std_count)] train2.shape",Yes,4,14.0 "#Season sns.boxplot(data=train2,y=train2['count'],x=train['season_decode']).set_title('Demand by season')",No,5,33.0 "#Year train2.groupby(['year','month'])['count'].mean().plot().set_title('demand by year') ",No,5,75.0 "#WeekDay & Hour: week_hour=train2.groupby(['weekday_name','hour'])['count'].mean().unstack() week_hour=week_hour.reindex(index=['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']) plt.figure(figsize=(15,6)) cmap2 = sns.cubehelix_palette(start=2,light=1, as_cmap=True) sns.heatmap(week_hour,cmap=cmap2).set_title('Demand by Day-Hour')",Yes,5,80.0 "#Difference between casual and resgitered train2.groupby(['hour'])['casual','registered','count'].mean().plot().set_title('Demand by hour') ",No,5,33.0 " train2.groupby(['weekday_name'])['casual','registered','count'].mean().plot(kind='bar').set_title('demand by day of week') ",No,5,33.0 "#Weather train2.groupby(['weather_decode'])['casual','registered'].mean().plot(kind='bar').set_title('demand by weather')",No,5,33.0 "#Temp season_temp=train2.groupby(['season_decode','temp'])['count'].mean().unstack() plt.figure(figsize=(15,8)) cmap3 = sns.cubehelix_palette(start=6,light=1, as_cmap=True) sns.heatmap(season_temp,cmap=cmap3).set_title('demand by season and temperature')",Yes,5,80.0 "Correlation_Matrix=train2[['holiday','workingday','weather','temp','atemp','humidity','windspeed','casual','registered','count']].corr() mask = np.array(Correlation_Matrix) mask[np.tril_indices_from(mask)] = False fig,ax= plt.subplots() fig.set_size_inches(20,10) sns.heatmap(Correlation_Matrix,mask=mask,vmax=.8,annot=True,square=True)",No,5,80.0 "#preparing data sets for random forest X=train2[['season','holiday','workingday','weather','temp','atemp','humidity','windspeed','year','month','weekday','hour']] y_count=train2['count'] y_casual=train2['casual'] y_reg=train2['registered']",No,5,21.0 "from sklearn.preprocessing import StandardScaler #Scaled all distributions X_Scaled=StandardScaler().fit_transform(X=X)",Yes,5,18.0 "from sklearn.model_selection import train_test_split #Split for train-test X_train, X_test, y_train, y_test = train_test_split(X_Scaled, y_count, test_size=0.25, random_state=42) ",Yes,5,13.0 "from sklearn.ensemble import RandomForestRegressor rf_count=RandomForestRegressor() rf_count.fit(X_train,y_train) importance_count=pd.DataFrame(rf_count.feature_importances_ , index=X.columns, columns=['count']).sort_values(by='count',ascending=False) ",Yes,4,7.0 "importance_count.plot(kind='bar',color='r').set_title('Importance of features for total demand')",No,5,79.0 "#repeat for casual demand: X_train, X_test, y_train, y_test = train_test_split(X_Scaled, y_casual, test_size=0.25, random_state=42) rf_casual=RandomForestRegressor() rf_casual.fit(X_train,y_train) importance_casual=pd.DataFrame(rf_casual.feature_importances_ , index=X.columns, columns=['casual']).sort_values(by='casual',ascending=False) ",Yes,4,7.0 importance_casual.plot(kind='bar').set_title('Importance of features for casual demand'),No,5,79.0 "#repeat for registered demand: X_train, X_test, y_train, y_test = train_test_split(X_Scaled, y_reg, test_size=0.25, random_state=42) rf_reg=RandomForestRegressor() rf_reg.fit(X_train,y_train) importance_reg=pd.DataFrame(rf_reg.feature_importances_ , index=X.columns, columns=['reg']).sort_values(by='reg',ascending=False) ",Yes,4,7.0 "importance_reg.plot(kind='bar',color='g').set_title('Importance of 
features for registered demand')",No,5,79.0 "importance_df=pd.concat([importance_count,importance_casual,importance_reg],axis=1) importance_df.plot(kind='bar').set_title('Feature importance for each kind of demand')",No,4,79.0 "#Prepare Training data X_train=train2[feature_selection] print(X_train.shape) y_train=train2['count'] print(y_train.shape)",No,4,14.0 "#Prepare Test data #parse string datetime into datetime format test['datetime2']=test.datetime.apply(lambda x: parser.parse(x)) #Get some different time variables test['year']=test.datetime2.apply(lambda x: x.year) test['month']=test.datetime2.apply(lambda x: x.month) test['weekday']=test.datetime2.apply(lambda x: x.weekday()) test['hour']=test.datetime2.apply(lambda x: x.hour) X_test=test[feature_selection] print(X_test.shape)",Yes,4,8.0 "X_train_scaled=StandardScaler().fit_transform(X=X_train) X_test_scaled=StandardScaler().fit_transform(X=X_test)",No,5,18.0 "from sklearn.metrics import mean_squared_log_error from sklearn.metrics import make_scorer def rmsle(y,y_pred): return np.sqrt(mean_squared_log_error(y,y_pred)) rmsle_score=make_scorer(rmsle)",Yes,4,49.0 "from sklearn.ensemble import RandomForestRegressor from sklearn.model_selection import cross_val_score rfr=RandomForestRegressor(random_state=42) score=cross_val_score(rfr,X_train_scaled,y_train,cv=15,scoring=rmsle_score) print(f'Score rmsle mean: {np.round(score.mean(),4)}') print(f'Score rmsle std: {np.round(score.std(),4)}')",Yes,5,28.0 "rfr.fit(X_train_scaled,y_train) y_pred=rfr.predict(X_test_scaled)",Yes,4,7.0 "submission=pd.read_csv('/kaggle/input/bike-sharing-demand/sampleSubmission.csv') submission['count']=y_pred submission.to_csv('submissionI.csv',index=False)",Yes,4,25.0 "#Without Scaling Data rfr.fit(X_train,y_train) y_pred=rfr.predict(X_test) submission2=pd.read_csv('/kaggle/input/bike-sharing-demand/sampleSubmission.csv') submission2['count']=y_pred submission2.to_csv('submissionII.csv',index=False) ",Yes,4,7.0 "from sklearn.model_selection import GridSearchCV, train_test_split x_train2,x_test2,y_train2,y_test2=train_test_split(X_train,y_train,test_size=0.25,random_state=42) params={'n_estimators': [10,50,100,300,500], 'n_jobs':[-1], 'max_features':['auto','sqrt','log2'], 'random_state':[42]} rfr_tuned=GridSearchCV(estimator=RandomForestRegressor(),param_grid=params,scoring='neg_mean_squared_log_error',verbose=True) rfr_tuned.fit(x_train2,y_train2) print(rfr_tuned.best_params_) print(rfr_tuned.best_estimator_) ",Yes,4,6.0 "from sklearn.ensemble import RandomForestRegressor rfr_final=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=-1, oob_score=False, random_state=42, verbose=0, warm_start=False) rfr_final.fit(x_train2,y_train2) y_pred2=rfr_final.predict(x_test2) print('RMSLE:',np.round(rmsle(y_test2,y_pred2),4))",Yes,3,7.0 "rfr_final=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=-1, oob_score=False, random_state=42, verbose=0, warm_start=False) rfr_final.fit(X_train,y_train) y_pred=rfr.predict(X_test) submission3=pd.read_csv('/kaggle/input/bike-sharing-demand/sampleSubmission.csv') submission3['count']=y_pred 
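# Note: the prediction above is taken from `rfr` (the untuned model) even though
# `rfr_final` was just refit on the full training set; if the tuned model was intended,
# the fitted grid-search winner can also be reused directly instead of re-typing its
# parameters. A sketch of that alternative, left as comments so the submission above
# is unchanged:
# best_rf = rfr_tuned.best_estimator_   # refit on x_train2/y_train2 by GridSearchCV
# best_rf.fit(X_train, y_train)         # refit on the full training data
# submission3['count'] = best_rf.predict(X_test)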
submission3.to_csv('submissionIII.csv',index=False)",Yes,3,7.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load in import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) import matplotlib from sklearn.ensemble import RandomForestRegressor # Input data files are available in the ""../input/"" directory. # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory import os for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename)) # Any results you write to the current directory are saved as output.",No,5,88.0 "train = pd.read_csv(""/kaggle/input/bike-sharing-demand/train.csv"",parse_dates=[""datetime""]) train.head()",Yes,4,45.0 "test = pd.read_csv(""/kaggle/input/bike-sharing-demand/test.csv"",parse_dates=[""datetime""]) test.head()",Yes,4,45.0 "train.info() test.info()",No,5,40.0 "train[""year""] = train[""datetime""].dt.year train[""month""] = train[""datetime""].dt.month train[""hour""] = train[""datetime""].dt.hour train[""dayofweek""] = train[""datetime""].dt.dayofweek train.shape",Yes,5,8.0 "test[""year""] = test[""datetime""].dt.year test[""month""] = test[""datetime""].dt.month test[""hour""] = test[""datetime""].dt.hour test[""dayofweek""] = test[""datetime""].dt.dayofweek test.shape",Yes,4,8.0 "for var in categorical_feature: train[var] = train[var].astype(""category"") test[var] = test[var].astype(""category"") train.info()",Yes,5,16.0 "X_train = train[feature] X_test = test[feature] X_train.head()",Yes,3,21.0 "Y_train = train[""count""] Y_train.head()",Yes,3,41.0 "model = RandomForestRegressor(n_estimators=500) Y_train_log = np.log1p(Y_train) model.fit(X_train,Y_train_log) result = model.predict(X_test)",Yes,4,7.0 "sub = pd.read_csv(""/kaggle/input/bike-sharing-demand/sampleSubmission.csv"") sub.head()",Yes,4,45.0 "sub[""count""] = np.exp(result) sub.head()",Yes,5,55.0 "sub.to_csv(""20_03_29sub.csv"",index=False)",No,5,25.0 "import calendar import seaborn as sb import xgboost as xgb import plotly.express as px import pandas_profiling as pp import matplotlib.pyplot as plt from plotly.subplots import make_subplots from sklearn.tree import DecisionTreeRegressor from sklearn.ensemble import RandomForestRegressor from sklearn.preprocessing import PolynomialFeatures from sklearn.linear_model import LinearRegression, Ridge from sklearn.metrics import mean_squared_log_error,make_scorer from sklearn.model_selection import train_test_split,GridSearchCV",No,5,22.0 "#Reading the file df_train = pd.read_csv(""/kaggle/input/bike-sharing-demand/train.csv"")",No,5,45.0 pp.ProfileReport(df_train),No,5,40.0 df_train.isnull().sum(axis=0),No,5,39.0 "corr = df_train[['temp','atemp','humidity', 'windspeed','casual', 'registered', 'count']].corr() f,axes = plt.subplots(1,1,figsize = (8,8)) sb.heatmap(corr,square=True,annot = True,linewidth = .5,center = 1.4,ax = axes)",No,5,80.0 "y = ['casual','registered','count'] list_continuous = ['temp','atemp','humidity','windspeed'] n=3 s= 15 f,axes = plt.subplots(4,3,figsize = (s,s)) counter = 0 for i in list_continuous: for j in y: sb.lineplot(x = i , y = j , data = df_train, ax = axes[counter//n][counter%n]) counter+=1",No,4,33.0 "df_train['Date'] = pd.DatetimeIndex(df_train['datetime']).date df_train['Hour'] = 
pd.DatetimeIndex(df_train['datetime']).hour df_train['Day'] = pd.DatetimeIndex(df_train['datetime']).day df_train['Month'] = pd.DatetimeIndex(df_train['datetime']).month df_train['Year'] = pd.DatetimeIndex(df_train['datetime']).year df_train['Weekday'] = pd.DatetimeIndex(df_train['datetime']).weekday_name",No,5,8.0 "a = [] for i in df_train.index: a.append('Total Count : '+str(df_train['count'][i])) df_train['count_vis'] = a",No,2,12.0 "fig = px.line(x = 'Date', y = ""count"", data_frame = df_train,color = 'Hour', range_y = (0,1150),hover_data = ['Hour','Date','casual','registered'], title = 'Interactive LinePlot of the whole dataset(Hover for more details)', hover_name = 'count_vis', text = None,height = 670,width = 980) fig.show()'",No,5,75.0 "f,axes = plt.subplots(1,3,figsize = (17,7)) sb.despine(left = True) x = 'season' sb.barplot(x = x , y = 'casual' , data = df_train, saturation = 1, ax = axes[0]) sb.barplot(x = x , y = 'registered' , data = df_train, saturation = 1, ax = axes[1]) sb.barplot(x = x , y = 'count' , data = df_train, saturation = 1, ax = axes[2])",No,5,33.0 "f,axes = plt.subplots(1,3,figsize = (17,7)) sb.despine(left = True) x = 'holiday' sb.barplot(x = x , y = 'casual' , data = df_train, saturation = 1, ax = axes[0] ,) sb.barplot(x = x , y = 'registered' , data = df_train, saturation = 1, ax = axes[1]) sb.barplot(x = x , y = 'count' , data = df_train, saturation = 1, ax = axes[2])",No,5,33.0 "f,axes = plt.subplots(1,3,figsize = (17,7)) sb.despine(left = True) x = 'workingday' sb.barplot(x = x , y = 'casual' , data = df_train, saturation = 1, ax = axes[0] ,) sb.barplot(x = x , y = 'registered' , data = df_train, saturation = 1, ax = axes[1]) sb.barplot(x = x , y = 'count' , data = df_train, saturation = 1, ax = axes[2])",No,5,33.0 "f,axes = plt.subplots(1,3,figsize = (17,7)) sb.despine(left = True) x = 'weather' sb.barplot(x = x , y = 'casual' , data = df_train, saturation = 1, ax = axes[0] ) sb.barplot(x = x , y = 'registered' , data = df_train, saturation = 1, ax = axes[1]) sb.barplot(x = x , y = 'count' , data = df_train, saturation = 1, ax = axes[2])",No,5,33.0 "f,axes = plt.subplots(1,3,figsize = (19,7)) sb.despine(left = True) x = 'Hour' sb.barplot(x = x , y = 'casual' , data = df_train, saturation = 1, ax = axes[0] ,) sb.barplot(x = x , y = 'registered' , data = df_train, saturation = 1, ax = axes[1]) sb.barplot(x = x , y = 'count' , data = df_train, saturation = 1, ax = axes[2])",No,5,75.0 df_train.groupby('Weekday').count().index,No,3,60.0 "df_train_temp = df_train.groupby(['Hour','Weekday']).mean().reset_index() dic = {'Weekday':['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday', 'Sunday']} dic1 = {'registered':'Average count of registered poeple commuting.', 'count': 'Average people commuting','Hour':'Hour of the day', 'Weekday':'Day of the week'} fig = px.line(x = 'Hour', y = ""registered"", data_frame = df_train_temp.reset_index(), color = 'Weekday',hover_data = ['count'],category_orders = dic, title = 'Interactive LinePlot of the registered separated by weekday(Hover for more details)', labels = dic1,range_y = [0,550],height = 670,width = 980) fig.show()'",No,5,75.0 "df_train_temp = df_train.groupby(['Hour','Weekday']).mean().reset_index() dic = {'Weekday':['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday', 'Sunday']} dic1 = {'casual':'Average count of casual poeple commuting.', 'count': 'Average people commuting','Hour':'Hour of the day', 'Weekday':'Day of the week'} fig = px.line(x = 'Hour', y = ""casual"", data_frame = 
df_train_temp.reset_index(), color = 'Weekday',hover_data = ['count'],category_orders = dic, title = 'Interactive LinePlot of the casual separated by weekday(Hover for more details)', labels = dic1,range_y = [0,550],height = 670,width = 980) fig.show()'",No,5,75.0 "f,axes = plt.subplots(1,3,figsize = (19,7)) sb.despine(left = True) x = 'Day' sb.barplot(x = x , y = 'casual' , data = df_train, saturation = 1, ax = axes[0] ,) sb.barplot(x = x , y = 'registered' , data = df_train, saturation = 1, ax = axes[1]) sb.barplot(x = x , y = 'count' , data = df_train, saturation = 1, ax = axes[2])",No,5,33.0 "f,axes = plt.subplots(1,3,figsize = (19,7)) sb.despine(left = True) x = 'Month' #order = ['January','February','March','April','May','June','July','August','September','October','November','December'] plot = sb.barplot(x = x , y = 'casual' , data = df_train, saturation = 1, ax = axes[0]) sb.barplot(x = x , y = 'registered' , data = df_train, saturation = 1, ax = axes[1]) sb.barplot(x = x , y = 'count' , data = df_train, saturation = 1, ax = axes[2])",No,5,75.0 "f,axes = plt.subplots(1,3,figsize = (19,7)) sb.despine(left = True) x = 'Year' sb.barplot(x = x , y = 'casual' , data = df_train, saturation = 1, ax = axes[0] ,) sb.barplot(x = x , y = 'registered' , data = df_train, saturation = 1, ax = axes[1]) sb.barplot(x = x , y = 'count' , data = df_train, saturation = 1, ax = axes[2])",No,5,33.0 df_train.describe(),No,5,40.0 "for i in df_train.groupby('season').count().index: s = 's'+str(i) a=[] for j in df_train.season: if j==i: a.append(1) else: a.append(0) df_train[s]=a df_train.sample(5)",Yes,3,12.0 "for i in df_train.groupby('weather').count().index: s = 'w'+str(i) a=[] for j in df_train.weather: if j==i: a.append(1) else: a.append(0) df_train[s]=a df_train.sample(5)",Yes,3,12.0 "for i in df_train.groupby('Hour').count().index: s = 'Hour'+str(i) a=[] for j in df_train.Hour: if j==i: a.append(1) else: a.append(0) df_train[s]=a df_train.sample(5)",Yes,3,12.0 "for i in df_train.groupby(""Month"").count().index: s = 'Month' + str(i) a = [] for j in df_train.Month: if j==i: a.append(1) else: a.append(0) df_train[s] = a df_train.sample(5)'",Yes,3,12.0 "df_train = df_train[['Hour0', 'Hour1', 'Hour2', 'Hour3', 'Hour4', 'Hour5', 'Hour6', 'Hour7', 'Hour8', 'Hour9', 'Hour10', 'Hour11', 'Hour12', 'Hour13', 'Hour14', 'Hour15', 'Hour16', 'Hour17', 'Hour18', 'Hour19', 'Hour20', 'Hour21', 'Hour22', 'Hour23','Month1', 'Month2', 'Month3', 'Month4', 'Month5', 'Month6', 'Month7', 'Month8', 'Month9', 'Month10', 'Month11', 'Month12','Year','s1','s2','s3','s4','holiday','workingday', 'w1','w2','w3','w4','temp','humidity','casual','registered']]",No,5,10.0 "df_train_x = df_train.drop('casual',axis = 1).drop('registered',axis=1) df_train_x.describe()",Yes,4,10.0 "df_reg_train_y = df_train['registered'] df_reg_train_y.describe",Yes,5,40.0 "df_cas_train_y = df_train['casual'] df_cas_train_y.describe",Yes,4,40.0 "x1_train, x1_test, y1_train, y1_test = train_test_split(df_train_x, df_reg_train_y, test_size=0.15, random_state=42) x2_train, x2_test, y2_train, y2_test = train_test_split(df_train_x, df_cas_train_y, test_size=0.15, random_state=42)",No,5,13.0 "poly = PolynomialFeatures(degree=2) poly_x1_train = poly.fit_transform(x1_train) poly_x1_test = poly.fit_transform(x1_test) poly_x2_train = poly.fit_transform(x2_train) poly_x2_test = poly.fit_transform(x2_test)",No,5,8.0 "rf = RandomForestRegressor() xg = xgb.XGBRegressor() parameter = {""max_depth"": [1,2,3,4,5,6], ""eta"": [0.01,0.03,0.05], ""alpha"":[0],'n_estimators': 
[100,500,800,1000,1200,1400]} parameters = {'n_estimators':[50,100,150,200,250], 'min_impurity_decrease':[0.0,0.001,0.01], 'max_depth':[20,40,60,80,100]} models = ['Normal Linear Regression: ','Linear Regression over polynomial: ', 'Random Forest Regressor: ','XG Boosting: ']'",Yes,4,5.0 "def custom_scorer(y_true,y_pred): for i in range(len(y_pred)): if y_pred[i]<0: y_pred[i] = 1 return np.sqrt(mean_squared_log_error(y_true, y_pred )) scorer = make_scorer(custom_scorer,greater_is_better = False)",No,5,84.0 "predict = [] reg = LinearRegression().fit(x1_train, y1_train) pre_reg = reg.predict(x1_test) reg_poly = LinearRegression().fit(poly_x1_train, y1_train) pre_reg_poly = reg_poly.predict(poly_x1_test) rf_reg = GridSearchCV(rf, parameters, cv=5, verbose=2,scoring = scorer,n_jobs = -1) rf_reg.fit(x1_train, y1_train) pre_rf_reg = rf_reg.predict(x1_test) xg_reg = GridSearchCV(xg,parameter,cv=5,verbose = 2 , scoring = scorer, n_jobs = -1) xg_reg.fit(x1_train, y1_train) pre_xg_reg = xg_reg.predict(x1_test) predict.append(pre_reg) predict.append(pre_reg_poly) predict.append(pre_rf_reg) predict.append(pre_xg_reg)",Yes,3,7.0 "for prediction in range(len(predict)): pre = [] for p in predict[prediction]: if p < 1: pre.append(1) else: pre.append(p) print(models[prediction]+str(np.sqrt(mean_squared_log_error(y1_test, pre ))))",No,3,49.0 "predict = [] cas = LinearRegression().fit(x2_train, y2_train) pre_cas = cas.predict(x2_test) cas_poly = LinearRegression().fit(poly_x2_train, y2_train) pre_cas_poly = cas_poly.predict(poly_x2_test) rf_cas = GridSearchCV(rf, parameters, cv=5, verbose=2,scoring = scorer,n_jobs = -1) rf_cas.fit(x2_train, y2_train) pre_rf_cas = rf_cas.predict(x2_test) xg_cas = GridSearchCV(xg,parameter,cv=5,verbose = 2 , scoring = scorer, n_jobs = -1) xg_cas.fit(x2_train, y2_train) pre_xg_cas = xg_cas.predict(x2_test) predict.append(pre_cas) predict.append(pre_cas_poly) predict.append(pre_rf_cas) predict.append(pre_xg_cas)",Yes,3,7.0 "for prediction in range(len(predict)): pre = [] for p in predict[prediction]: if p < 1: pre.append(1) else: pre.append(p) print(models[prediction]+str(np.sqrt(mean_squared_log_error(y2_test, pre ))))",No,3,49.0 "print(""For Random Forest Model: "") print(""\\t Best Parametres for registered are: "",end='') print(rf_reg.best_params_) print(""\\t Best Parametres for casual are: "",end = '') print(rf_cas.best_params_) print(""\ For XGBoost Model: "") print(""\\t Best Parametres for registered are: "",end='') print(xg_reg.best_params_) print(""\\t Best Parametres for casual are: "",end = '') print(xg_cas.best_params_)'",No,2,2.0 "predict1 = [] reg1 = LinearRegression().fit(x1_train, y1_train) pre_reg1 = reg1.predict(x1_test) reg1_poly = LinearRegression().fit(poly_x1_train, y1_train) pre_reg1_poly = reg1_poly.predict(poly_x1_test) rf1 = RandomForestRegressor(n_estimators = 250,min_impurity_decrease = 0.001, max_depth=60).fit(x1_train, y1_train) pre_rf1 = rf1.predict(x1_test) xg1 = xgb.XGBRegressor(alpha = 0, eta = 0.03, n_estimators = 1200, max_depth = 6).fit(x1_train,y1_train) pre_xg1 = xg1.predict(x1_test) for i in range(pre_reg1.size): if pre_reg1[i]<1: pre_reg1[i] = 1 if pre_reg1_poly[i]<1: pre_reg1_poly[i] = 1 if pre_rf1[i]<1: pre_rf1[i] = 1 if pre_xg1[i]<1: pre_xg1[i] = 1 predict1.append(pre_reg1) predict1.append(pre_reg1_poly) predict1.append(pre_rf1) predict1.append(pre_xg1) x1_final = x1_test.copy() x1_final['Output'] = y1_test x1_final['Linear'] = pre_reg1 x1_final['Lin_poly'] = pre_reg1_poly x1_final['RF'] = pre_rf1 x1_final['XG'] = pre_xg1 
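# The per-element clipping loops above can be written as single vectorized calls; the
# lines below are no-ops at this point (the loop already floored the arrays at 1) and
# are included only as the equivalent idiomatic form.
pre_rf1 = np.maximum(pre_rf1, 1)
pre_xg1 = np.maximum(pre_xg1, 1)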
x1_final['Resid'] = y1_test-pre_reg1 x1_final['Resid_poly'] = y1_test-pre_reg1_poly x1_final['Resid_rf'] = y1_test - pre_rf1 x1_final['Resid_xg'] = y1_test - pre_xg1 for prediction in range(len(predict1)): print(models[prediction]+ str(np.sqrt(mean_squared_log_error(y1_test,predict1[prediction] ))))",Yes,2,7.0 "predict2 = [] reg2 = LinearRegression().fit(x2_train, y2_train) pre_reg2 = reg2.predict(x2_test) reg2_poly = LinearRegression().fit(poly_x2_train, y2_train) pre_reg2_poly = reg2_poly.predict(poly_x2_test) rf2 = RandomForestRegressor(n_estimators = 100,min_impurity_decrease = 0.001, max_depth=40).fit(x2_train, y2_train) pre_rf2 = rf2.predict(x2_test) xg2 = xgb.XGBRegressor(alpha = 0, eta = 0.05, n_estimators = 800, max_depth = 6).fit(x2_train,y2_train) pre_xg2 = xg2.predict(x2_test) for i in range(pre_reg2.size): if pre_reg2[i]<1: pre_reg2[i] = 1 if pre_reg2_poly[i]<1: pre_reg2_poly[i] = 1 if pre_rf2[i]<1: pre_rf2[i] = 1 if pre_xg2[i]<1: pre_xg2[i] = 1 predict2.append(pre_reg2) predict2.append(pre_reg2_poly) predict2.append(pre_rf2) predict2.append(pre_xg2) x2_final = x2_test.copy() x2_final['Output'] = y2_test x2_final['Linear'] = pre_reg2 x2_final['Lin_poly'] = pre_reg2_poly x2_final['RF'] = pre_rf2 x2_final['XG'] = pre_xg2 x2_final['Resid'] = y2_test-pre_reg2 x2_final['Resid_poly'] = y2_test-pre_reg2_poly x2_final['Resid_rf'] = y2_test - pre_rf2 x2_final['Resid_xg'] = y2_test - pre_xg2 for prediction in range(len(predict2)): print(models[prediction]+ str(np.sqrt(mean_squared_log_error(y2_test, predict2[prediction]))))",No,2,7.0 "name1 = ['Residual for casual without polynomial features'] *1633 name2 = ['Residual for casual with polynomial features'] *1633 name3 = ['Residual for registered without polynomial features'] *1633 name4 = ['Residual for registered with polynomial features'] *1633 dic = {'Lin': 'Output Predicted using linear model', 'Lin_poly': 'Output Predicted using polynomial features', 'RF' : 'Output Predicted using RandomForest Model', 'XG': 'Output Predicted using XGBoost Model', 'Resid':'Deviation from predicted','Output':'Expected Output', 'Resid_poly':'Deviation from predicted','Resid_rf':'Deviation from predicted', 'Output':'Expected Output','Resid_xg':'Deviation from predicted'} fig1 = px.scatter(data_frame = x1_final,x = 'Linear', y = 'Resid',hover_data = ['Output'], labels = dic,hover_name = name3,color_discrete_sequence = ['red']) fig2 = px.scatter(data_frame = x1_final,x = 'Lin_poly', y = 'Resid_poly', hover_data = ['Output'],labels = dic,hover_name = name4, color_discrete_sequence = ['blue']) fig3 = px.scatter(data_frame = x2_final,x = 'Linear', y = 'Resid',hover_data = ['Output'], labels = dic,hover_name = name1,color_discrete_sequence = ['darkgreen']) fig4 = px.scatter(data_frame = x2_final,x = 'Lin_poly', y = 'Resid_poly', hover_data = ['Output'],labels = dic,hover_name = name2, color_discrete_sequence = ['gold']) trace1 = fig1['data'][0] trace2 = fig2['data'][0] trace3 = fig3['data'][0] trace4 = fig4['data'][0] fig = make_subplots(rows=2, cols=2,horizontal_spacing =0.1,vertical_spacing = 0.2, row_titles = ['Linear Model','Polynomial Model'], column_titles = ['Casual','Registered'], x_title = 'Residual plots for Registered and Casual under different models (Hover for more details)') fig.add_trace(trace3, row=1, col=1) fig.add_trace(trace4, row=2, col=1) fig.add_trace(trace1, row=1, col=2) fig.add_trace(trace2, row=2, col=2) fig.show()",No,5,56.0 "name5 = ['Residual for casual using RandomForest Model'] *1633 name6 = ['Residual for casual using XGBoost 
Model'] *1633 name7 = ['Residual for registered using RandomForest Model'] *1633 name8 = ['Residual for registered using XGBoost Model'] *1633 dic = {'Lin': 'Output Predicted using linear model', 'Lin_poly': 'Output Predicted using polynomial features', 'RF' : 'Output Predicted using RandomForest Model', 'XG': 'Output Predicted using XGBoost Model', 'Resid':'Deviation from predicted','Output':'Expected Output', 'Resid_poly':'Deviation from predicted','Resid_rf':'Deviation from predicted', 'Output':'Expected Output','Resid_xg':'Deviation from predicted'} fig5 = px.scatter(data_frame = x1_final,x = 'RF', y = 'Resid_rf',hover_data = ['Output'], labels = dic,hover_name = name7,color_discrete_sequence = ['red']) fig6 = px.scatter(data_frame = x1_final,x = 'XG', y = 'Resid_xg',hover_data = ['Output'], labels = dic,hover_name = name8,color_discrete_sequence = ['blue']) fig7 = px.scatter(data_frame = x2_final,x = 'RF', y = 'Resid_rf',hover_data = ['Output'], labels = dic,hover_name = name5,color_discrete_sequence = ['darkgreen']) fig8 = px.scatter(data_frame = x2_final,x = 'XG', y = 'Resid_xg',hover_data = ['Output'], labels = dic,hover_name = name6,color_discrete_sequence = ['gold']) trace5 = fig5['data'][0] trace6 = fig6['data'][0] trace7 = fig7['data'][0] trace8 = fig8['data'][0] fig = make_subplots(rows=2, cols=2,horizontal_spacing =0.1,vertical_spacing = 0.2, row_titles = ['Random Forest','XGBoost'], column_titles = ['Casual','Registered'], x_title = 'Residual plots for Registered and Casual under different models (Hover for more details)') fig.add_trace(trace5, row=1, col=2) fig.add_trace(trace6, row=2, col=2) fig.add_trace(trace7, row=1, col=1) fig.add_trace(trace8, row=2, col=1) fig.show()",No,5,56.0 "rf1 = RandomForestRegressor(n_estimators = 200,min_impurity_decrease = 0.001, max_depth=80).fit(df_train_x,df_reg_train_y) xg2 = xgb.XGBRegressor(alpha = 0, eta = 0.05, max_depth = 6, n_estimators = 800).fit(df_train_x,df_cas_train_y)",No,5,7.0 df_test = pd.read_csv('/kaggle/input/bike-sharing-demand/test.csv'),No,5,45.0 "test=df_test test.describe()",No,5,40.0 "test['mth'] = pd.DatetimeIndex(test['datetime']).month test['Year'] = pd.DatetimeIndex(test['datetime']).year test['dy'] = pd.DatetimeIndex(test['datetime']).day test['hr'] = pd.DatetimeIndex(test['datetime']).hour for i in test.groupby(""season"").count().index: s = 's' + str(i) a = [] for j in test.season: if j==i: a.append(1) else: a.append(0) test[s] = a for i in test.groupby(""weather"").count().index: s = 'w' + str(i) a = [] for j in test.weather: if j==i: a.append(1) else: a.append(0) test[s] = a for i in test.groupby('hr').count().index: s = 'Hour'+str(i) a=[] for j in test.hr: if j==i: a.append(1) else: a.append(0) test[s]=a for i in test.groupby(""mth"").count().index: s = 'Month' + str(i) a = [] for j in test.mth: if j==i: a.append(1) else: a.append(0) test[s] = a test.sample(10)'",Yes,5,8.0 "test = test[['Hour0','Hour1','Hour2','Hour3','Hour4','Hour5','Hour6','Hour7','Hour8', 'Hour9','Hour10','Hour11','Hour12','Hour13','Hour14','Hour15','Hour16', 'Hour17','Hour18','Hour19','Hour20','Hour21','Hour22','Hour23','Month1', 'Month2','Month3','Month4','Month5','Month6','Month7','Month8','Month9', 'Month10','Month11','Month12','Year','s1','s2','s3','s4','holiday', 'workingday','w1','w2', 'w3','w4','temp','humidity']] test.describe",Yes,5,40.0 "pre_reg = rf1.predict(test) pre_cas = xg2.predict(test) final_predictions = pd.DataFrame(pre_cas+pre_reg,columns = ['cout']) final_predictions.describe",Yes,4,48.0 "s=[] for j in 
final_predictions.cout: if int(j)<1: s.append(1) else: s.append(j) final_predictions['count'] = s ",No,2,78.0 final_predictions.describe,No,5,40.0 "final_predictions['datetime']=df_test['datetime'] final_predictions = final_predictions[['datetime','count']]",No,5,55.0 final_predictions.describe(),No,5,40.0 "final_predictions.to_csv('submission.csv',index=False)",No,5,25.0 "import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns %matplotlib inline import warnings warnings.filterwarnings(""ignore"", category=FutureWarning) warnings.filterwarnings(""ignore"")",No,5,23.0 "train = pd.read_csv(""/kaggle/input/bike-sharing-demand/train.csv"") test = pd.read_csv(""/kaggle/input/bike-sharing-demand/test.csv"")",No,5,45.0 target = train['count'],No,3,12.0 "from scipy import stats from scipy.stats import norm",No,5,22.0 "sns.distplot(train['count'],fit=norm)",No,5,33.0 "train[""log_count""] = np.log(target+1)",No,5,8.0 "sns.distplot(train[""log_count""], fit=norm)",No,5,33.0 "feature_names=list(test) df_train=train[feature_names] df=pd.concat((df_train, test))",No,4,11.0 "print(train.shape, test.shape, df.shape)",No,5,58.0 import datetime,No,5,22.0 tmp = pd.to_datetime(train['datetime']),No,5,16.0 "df['datetime'] = pd.to_datetime(df['datetime']) df['day'] = df['datetime'].dt.day df['hour'] = df['datetime'].dt.hour df['dayofweek'] = df['datetime'].dt.dayofweek df['month'] = df['datetime'].dt.month df['year'] = df['datetime'].dt.year df['weekend'] = (df['dayofweek'] ==5) | (df['dayofweek'] == 6)",No,5,8.0 "train['datetime'] = pd.to_datetime(train['datetime']) train['day'] = train['datetime'].dt.day train['hour'] = train['datetime'].dt.hour train['dayofweek'] = train['datetime'].dt.dayofweek train['month'] = train['datetime'].dt.month train['year'] = train['datetime'].dt.year train['weekend'] = (train['dayofweek'] ==5) | (train['dayofweek'] == 6)",No,4,8.0 "df.drop(['datetime'], axis=1, inplace=True)",No,5,10.0 "figure, axs = plt.subplots(3,2, figsize = (15,10)) sns.barplot(data=train, x = ""day"", y = target, ax = axs[0][0]) sns.barplot(data=train, x = ""hour"", y = target, ax = axs[0][1]) sns.barplot(data=train, x = ""dayofweek"", y = target, ax = axs[1][0]) sns.barplot(data=train, x = ""weekend"", y = target, ax = axs[1][1]) sns.barplot(data=train, x = ""month"", y = target, ax = axs[2][0]) sns.barplot(data=train, x = ""year"", y = target, ax = axs[2][1])",No,5,75.0 "df=df.drop(columns=['month', 'day'])",No,5,10.0 df,No,5,41.0 "sns.barplot(data=df[:len(train)], x='season', y=target)",No,5,33.0 "season_encoded = pd.get_dummies(df['season'],prefix= 'season') df = pd.concat((df,season_encoded), axis=1) df = df.drop(columns = 'season')",Yes,4,20.0 "sns.barplot(data=df[:len(train)], x='holiday', y=target)",No,5,33.0 "sns.barplot(data=df[:len(train)], x='workingday', y=target)",No,5,33.0 "sns.barplot(data=df[:len(train)], x='weather', y=target) df['weather'] = df['weather']",No,5,33.0 "weather_encoded = pd.get_dummies(df['weather'],prefix= 'weather') df = pd.concat((df,weather_encoded), axis=1) df = df.drop(columns = 'weather')",Yes,4,20.0 "fig, (ax1, ax2, ax3, ax4, ax5) = plt.subplots(nrows = 5) fig.set_size_inches(20,30) sns.pointplot(data = train, x = ""hour"", y = ""count"", ax = ax1) sns.pointplot(data = train, x = ""hour"", y = ""count"", hue = ""season"", ax = ax2) sns.pointplot(data = train, x = ""hour"", y = ""count"", hue = ""holiday"", ax = ax3) sns.pointplot(data = train, x = ""hour"", y = ""count"", hue = ""workingday"", ax = ax4) sns.pointplot(data = 
train, x = ""hour"", y = ""count"", hue = ""weather"", ax = ax5)",No,5,75.0 "from scipy.stats import skew skew = df.apply(lambda x: skew(x)) skew.sort_values(ascending = False)",Yes,4,40.0 "skew = skew[abs(skew) > 0.5] skew",No,5,14.0 "cor = train.iloc[:,1:-1].corr() cor.head()",No,5,40.0 "mask = np.array(cor) mask[np.tril_indices_from(mask)] = False fig,ax= plt.subplots() fig.set_size_inches(20,10) sns.heatmap(cor,mask= mask,square=True,annot=True)",No,5,80.0 from statsmodels.stats.outliers_influence import variance_inflation_factor,No,5,22.0 "vif_data = df.iloc[:,:6] vif_data.info()",Yes,4,40.0 "vif = pd.DataFrame() vif['Features'] = vif_data.columns vif['vif'] = [variance_inflation_factor( vif_data.values, i) for i in range(vif_data.shape[1])] vif.sort_values(by='vif',ascending=False)",Yes,4,12.0 "sns.distplot(df['pca'], fit=norm)",No,5,33.0 "fig, [ax1,ax2,ax3] = plt.subplots(1,3) fig.set_size_inches(12,5) sns.regplot(train['temp'], 'count', data = train, ax=ax1) sns.regplot(train['humidity'], 'count', data = train, ax=ax2) sns.regplot(train['windspeed'], 'count', data = train, ax=ax3)",No,5,33.0 "stats.pearsonr(train['temp'],target)",No,4,47.0 "sns.countplot(data = df, x = ""windspeed"")",No,5,33.0 "df = df.drop(columns=['temp','atemp'])",No,5,10.0 "fig, axes = plt.subplots(nrows=3, ncols=2, figsize = (15,20)) sns.boxplot(data = train, y=""count"", x = ""holiday"", orient = ""v"", ax = axes[0][0]) sns.boxplot(data = train, y=""count"", x = ""workingday"", orient = ""v"", ax = axes[0][1]) sns.boxplot(data = train, y=""count"", x = ""hour"", orient = ""v"", ax = axes[1][0]) sns.boxplot(data = train, y=""count"", x = ""dayofweek"", orient = ""v"", ax = axes[1][1]) sns.boxplot(data = train, y=""count"", x = ""year"", orient = ""v"", ax = axes[2][0])",No,5,33.0 "from sklearn.model_selection import KFold, cross_val_score, train_test_split, GridSearchCV from sklearn.metrics import mean_squared_log_error from sklearn.preprocessing import RobustScaler from sklearn.pipeline import make_pipeline from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor from xgboost import XGBRegressor from lightgbm import LGBMRegressor from sklearn.linear_model import ElasticNet, Lasso, LinearRegression",No,5,22.0 "new_train = df[:train.shape[0]] new_test = df[train.shape[0]:]",No,5,13.0 target = train['log_count'],No,5,21.0 "X_train, X_val, y_train, y_val = train_test_split(new_train, target, test_size=0.2, shuffle=True)",No,5,13.0 "def rmsle_score(preds, true): rmsle_score = (np.sum((np.log1p(preds)-np.log1p(true))**2)/len(true))**0.5 return rmsle_score",No,5,84.0 "from sklearn.metrics.scorer import make_scorer RMSLE = make_scorer(rmsle_score)",Yes,5,84.0 import statsmodels.api as sm,No,5,22.0 "lasso = make_pipeline(GridSearchCV(Lasso(random_state=1),param, cv=10, scoring = RMSLE))",No,5,53.0 "lasso.fit(X_train,y_train)",No,5,7.0 la_yhat = lasso.predict(X_val),No,5,48.0 "s_lasso = rmsle_score(la_yhat,y_val) s_lasso",No,5,49.0 pred_la = lasso.predict(new_test),No,5,48.0 "param_e = {'alpha' :[0.1,1.0,10], 'max_iter' :[1000000], 'l1_ratio':[0.04,0.05], 'normalize':[True,False]}",No,5,5.0 "Enet.fit(X_train,y_train)",No,5,7.0 Enet_yhat = Enet.predict(X_val),No,5,48.0 "s_Enet = rmsle_score(Enet_yhat,y_val) s_Enet",No,5,49.0 pred_Enet = Enet.predict(new_test),No,5,48.0 "param_Rf = {'min_samples_split' : [3,4,6,10], 'n_estimators' : [70,100], 'random_state': [5] }",No,5,5.0 "RF = make_pipeline(GridSearchCV(RandomForestRegressor(random_state=1),param_Rf, cv=10, scoring = RMSLE))",No,5,82.0 
"RF.fit(X_train,y_train)",No,5,7.0 "RF_yhat = RF.predict(X_val) s_RF = rmsle_score(RF_yhat,y_val) s_RF",No,3,48.0 pred_RF = RF.predict(new_test),No,5,48.0 "param_GB = [{'learning_rate': [1,0.1,0.01,0.001], 'n_estimators': [50, 100, 200, 500, 1000]}]",No,5,5.0 "GB = make_pipeline(GridSearchCV(GradientBoostingRegressor(random_state=1),param_GB, cv=10, scoring = RMSLE))",No,5,4.0 "GB.fit(X_train,y_train)",No,5,7.0 "GB_yhat = GB.predict(X_val) s_GB = rmsle_score(GB_yhat,y_val) s_GB",Yes,3,48.0 pred_GB = GB.predict(new_test),No,5,48.0 "param_lgb = param_grid = [{ 'n_estimators': [400, 700, 1000], 'max_depth': [15,20,25], 'num_leaves': [50, 100, 200], 'min_split_gain': [0.3, 0.4], }]",No,5,5.0 "lgb = make_pipeline(GridSearchCV(LGBMRegressor(verbose_eval=False,random_state=1),param_lgb, cv=10, scoring = RMSLE))",No,3,4.0 "lgb.fit(X_train,y_train)",No,5,7.0 "lgb_yhat = lgb.predict(X_val) s_lgb = rmsle_score(lgb_yhat,y_val) s_lgb",Yes,4,27.0 pred_lgb = lgb.predict(new_test),No,5,48.0 "sns.barplot(x=list_regressors, y=list_scores) plt.ylabel('RMSE')",No,5,33.0 "df_predictions = pd.DataFrame(data=predictions) df_predictions.corr()",Yes,5,40.0 "plt.figure(figsize=(7, 7)) sns.heatmap(df_predictions.corr(),linewidths=1.5, annot=True, square=True, yticklabels=df_predictions.columns , xticklabels=df_predictions.columns) ",No,5,80.0 "RF.fit(new_train,target)",No,5,7.0 "log_pred=RF.predict(new_test) predictions=np.exp(log_pred)-1",No,5,48.0 "sub = pd.DataFrame() sub['datetime'] = test['datetime'] sub['count'] = predictions sub.head()",No,5,55.0 "sub.to_csv('submission.csv', index=False)",No,5,25.0 "lgb.fit(new_train,target)",No,5,7.0 "log_pred_lgb=lgb.predict(new_test) predictions_lgb=np.exp(log_pred_lgb)-1",No,5,48.0 "sub = pd.DataFrame() sub['datetime'] = test['datetime'] sub['count'] = predictions_lgb sub.head()",No,3,41.0 "sub = pd.DataFrame() sub['datetime'] = test['datetime'] sub['count'] = ensemble sub.head()",No,5,55.0 "import numpy as np from sklearn.metrics import confusion_matrix from sklearn.metrics import f1_score from sklearn.metrics import accuracy_score import pandas as pd import numpy as np import matplotlib.pyplot as plt from sklearn.linear_model import LinearRegression from sklearn.metrics import mean_squared_error from sklearn.metrics import mean_absolute_error from sklearn.metrics import r2_score from sklearn.preprocessing import PolynomialFeatures from sklearn.tree import DecisionTreeRegressor #from sklearn.ensemble import RandomForestClassifier from sklearn.ensemble import RandomForestRegressor from sklearn.neighbors import KNeighborsRegressor import matplotlib.pyplot as plt from sklearn.model_selection import train_test_split from lightgbm import LGBMClassifier from sklearn.linear_model import Ridge from sklearn.linear_model import LogisticRegression from sklearn.model_selection import GridSearchCV import seaborn as sns import warnings warnings.filterwarnings(""ignore"")",No,5,23.0 "filepath= '/kaggle/input/bike-sharing-demand/train.csv' filepath1= '/kaggle/input/bike-sharing-demand/test.csv' testdata= pd.read_csv(filepath1) testdata2=testdata traindata= pd.read_csv(filepath) traindata.head()",Yes,4,45.0 "#We remove the id column in both the training and testing datasets. 
traindata=traindata.drop('datetime',axis=1) testdata=testdata.drop('datetime',axis=1) #We also remove the casual and registered columns because they are not present in the test dataset traindata=traindata.drop('casual',axis=1) traindata=traindata.drop('registered',axis=1)",No,5,10.0 "b""#Checking the correlation of each column with the count\ncorr = numeric_features.corr()\nprint (corr['count'].sort_values(ascending=False), '\\n')\nprint (corr['count'].sort_values(ascending=False))\n""",No,4,40.0 "#We see how various features compare with the count column_names=['temp','atemp','windspeed'] for i in column_names: plt.scatter(x=traindata[i], y=traindata['count']) plt.ylabel('count') plt.xlabel(i) plt.show()",No,5,33.0 "#Let us visualize the other columns and see how they relate with counts col=['season','holiday','workingday','weather'] for i in col: sns.factorplot(x=i,y=""count"",data=traindata,kind='bar',size=5,aspect=1.5)",No,5,33.0 "traindata.temp.unique() fig,axes=plt.subplots(2,2) axes[0,0].hist(x=""temp"",data=traindata,edgecolor=""black"",linewidth=2,color='#ff4125') axes[0,0].set_title(""Variation of temp"") axes[0,1].hist(x=""atemp"",data=traindata,edgecolor=""black"",linewidth=2,color='#ff4125') axes[0,1].set_title(""Variation of atemp"") axes[1,0].hist(x=""windspeed"",data=traindata,edgecolor=""black"",linewidth=2,color='#ff4125') axes[1,0].set_title(""Variation of windspeed"") axes[1,1].hist(x=""humidity"",data=traindata,edgecolor=""black"",linewidth=2,color='#ff4125') axes[1,1].set_title(""Variation of humidity"") fig.set_size_inches(10,10)",No,5,33.0 "#Now we will visualise the remaining features and compare them with the number of rentals column_names=['season','holiday','workingday','weather'] for i in column_names: feature = traindata.pivot_table(index=i, values='count') feature.plot(kind='bar', color='blue') plt.xlabel(i) plt.ylabel('counts') plt.xticks(rotation=0) plt.show()",No,5,33.0 "#Split the data into train and test y=traindata['count'] x=traindata.drop('count',axis=1) x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.70,test_size=0.30, random_state=0)",Yes,5,13.0 " #Linear Regression linearRegressor = LinearRegression() linearRegressor.fit(x_train, y_train) y_predicted = linearRegressor.predict(x_test) mse = mean_squared_error(y_test, y_predicted) r = r2_score(y_test, y_predicted) mae = mean_absolute_error(y_test,y_predicted) print(""Mean Squared Error:"",mse) print(""R score:"",r) print(""Mean Absolute Error:"",mae)",Yes,3,7.0 "#for random forest regression. 
(tuning) no_of_test=[500] params_dict={'n_estimators':no_of_test,'n_jobs':[-1],'max_features':[""auto"",'sqrt','log2']} clf_rf=GridSearchCV(estimator=RandomForestRegressor(),param_grid=params_dict,scoring='neg_mean_squared_log_error') clf_rf.fit(x_train,y_train) pred=clf_rf.predict(x_test) mse = mean_squared_error(y_test, pred) r = r2_score(y_test, pred) mae = mean_absolute_error(y_test,pred) print(""Mean Squared Error:"",mse) print(""R score:"",r) print(""Mean Absolute Error:"",mae)'",Yes,3,6.0 "# for KNN (tuning) n_neighbors=[] for i in range (0,50,5): if(i!=0): n_neighbors.append(i) params_dict={'n_neighbors':n_neighbors,'n_jobs':[-1]} clf_knn=GridSearchCV(estimator=KNeighborsRegressor(),param_grid=params_dict,scoring='neg_mean_squared_log_error') clf_knn.fit(x_train,y_train) pred=clf_knn.predict(x_test) mse = mean_squared_error(y_test, pred) r = r2_score(y_test, pred) mae = mean_absolute_error(y_test,pred) print(""Mean Squared Error:"",mse) print(""R score:"",r) print(""Mean Absolute Error:"",mae)'",Yes,3,6.0 "# Thus we can use RandomForest Regresson. no_of_test=[500] params_dict={'n_estimators':no_of_test,'n_jobs':[-1],'max_features':[""auto"",'sqrt','log2']} clf_rf=GridSearchCV(estimator=RandomForestRegressor(),param_grid=params_dict,scoring='neg_mean_squared_log_error') clf_rf.fit(x,y) Prediction=clf_rf.predict(testdata)'",Yes,4,6.0 "predictionlist=Prediction.tolist() counts=testdata2['datetime'].tolist() output=pd.DataFrame(list(zip(counts, predictionlist)), columns=['datetime','count']) output.head() output.to_csv('my_submission(ikeSharingDemand).csv', index=False)",Yes,5,25.0 "# import warnings warnings.filterwarnings('ignore') import numpy as np import pandas as pd import seaborn as sns # import matplotlib.pyplot as plt import calendar from datetime import datetime import os print(os.listdir(""../input""))'",Yes,4,23.0 "b""# \n\ntrain = pd.read_csv('/kaggle/input/bike-sharing-demand/train.csv')\ntest = pd.read_csv('/kaggle/input/bike-sharing-demand/test.csv')\n\ntrain.head()""",Yes,4,45.0 """"""" datetime - hourly date + timestamp season - 1 = spring, 2 = summer, 3 = fall, 4 = winter holiday - whether the day is considered a holiday workingday - whether the day is neither a weekend nor holiday weather - 1: Clear, Few clouds, Partly cloudy, Partly cloudy 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds 4: Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog temp - temperature in Celsius atemp - ""feels like"" temperature in Celsius humidity - relative humidity windspeed - wind speed casual - number of non-registered user rentals initiated registered - number of registered user rentals initiated count - number of total rentals """""" train.info()'",No,5,40.0 "# test.head()'",No,5,41.0 "b""# \n# split - - \ntrain['tempDate'] = train.datetime.apply(lambda x:x.split())""",No,4,13.0 "# tempDate--yearmonthdayweekday train['year'] = train.tempDate.apply(lambda x:x[0].split('-')[0]) train['month'] = train.tempDate.apply(lambda x:x[0].split('-')[1]) train['day'] = train.tempDate.apply(lambda x:x[0].split('-')[2]) #weekdaycalendardatetime train['weekday'] = train.tempDate.apply(lambda x:calendar.day_name[datetime.strptime(x[0],""%Y-%m-%d"").weekday()]) train['hour'] = train.tempDate.apply(lambda x:x[1].split(':')[0])'",No,4,13.0 "b""# \n\ntrain['year'] = pd.to_numeric(train.year,errors='coerce')\ntrain['month'] = pd.to_numeric(train.month,errors='coerce')\ntrain['day'] = 
pd.to_numeric(train.day,errors='coerce')\ntrain['hour'] = pd.to_numeric(train.hour,errors='coerce')""",No,5,16.0 "# train.info()'",No,5,40.0 "b""# tempDate\n\ntrain = train.drop('tempDate',axis=1)""",No,5,10.0 "b""# count\n\n#year - count\nfig = plt.figure(figsize=[12,10])\nax1 = fig.add_subplot(2,2,1)\nax1 = sns.barplot(x='year',y='count',data=train.groupby('year')['count'].mean().reset_index())\n\n#month - count\nax2 = fig.add_subplot(2,2,2)\nax2 = sns.barplot(x='month',y='count',data=train.groupby('month')['count'].mean().reset_index())\n\n#day - count\nax3 = fig.add_subplot(2,2,3)\nax3 = sns.barplot(x='day',y='count',data=train.groupby('day')['count'].mean().reset_index())\n\n#hour - count\nax4 = fig.add_subplot(2,2,4)\nax4 = sns.barplot(x='hour',y='count',data=train.groupby('hour')['count'].mean().reset_index())""",No,5,75.0 "#season - count fig = plt.figure(figsize=[12,10]) ax1 = fig.add_subplot(2,2,1) ax1 = sns.barplot(x='season',y='count',data=train.groupby('season')['count'].mean().reset_index()) #holiday - count ax2 = fig.add_subplot(2,2,2) ax2 = sns.barplot(x='holiday',y='count',data=train.groupby('holiday')['count'].mean().reset_index()) #workingday - count ax3 = fig.add_subplot(2,2,3) ax3 = sns.barplot(x='workingday',y='count',data=train.groupby('workingday')['count'].mean().reset_index()) #weather - count ax4 = fig.add_subplot(2,2,4) ax4 = sns.barplot(x='weather',y='count',data=train.groupby('weather')['count'].mean().reset_index())",No,5,33.0 "def badToRight(month): if month in [12,1,2]: return 4 elif month in [3,4,5]: return 1 elif month in [6,7,8]: return 2 elif month in [9,10,11]: return 3 train['season'] = train.month.apply(badToRight)",No,5,8.0 "b""# 1\n\n#season - count\nfig = plt.figure(figsize=[12,10])\nax1 = fig.add_subplot(2,2,1)\nax1 = sns.barplot(x='season',y='count',data=train.groupby('season')['count'].mean().reset_index())\n\n#holiday - count\nax2 = fig.add_subplot(2,2,2)\nax2 = sns.barplot(x='holiday',y='count',data=train.groupby('holiday')['count'].mean().reset_index())\n\n#woikingday - count\nax3 = fig.add_subplot(2,2,3)\nax3 = sns.barplot(x='workingday',y='count',data=train.groupby('workingday')['count'].mean().reset_index())\n\n#weather - count\nax4 = fig.add_subplot(2,2,4)\nax4 = sns.barplot(x='weather',y='count',data=train.groupby('weather')['count'].mean().reset_index())""",No,5,33.0 "# heatmap fig = plt.figure(figsize=[20,20]) ax = sns.heatmap(train.corr(),annot=True,square=True)'",No,5,80.0 "b""# heatmapcount\n\n#hour season - count\nfig = plt.figure(figsize=[12,10])\nax1 = fig.add_subplot(2,2,1)\nax1 = sns.pointplot(x='hour',y='count',hue='season',data=train.groupby(['season','hour'])['count'].mean().reset_index())\n\n#hour holiday - count\nax2 = fig.add_subplot(2,2,2)\nax2 = sns.pointplot(x='hour',y='count',hue='holiday',data=train.groupby(['holiday','hour'])['count'].mean().reset_index())\n\n#hour weekday - count\nax3 = fig.add_subplot(2,2,3)\nax3 = sns.pointplot(x='hour',y='count',hue='weekday',hue_order=['Sunday','Monday','Tuesday','Wendnesday','Thursday','Friday','Saturday'],data=train.groupby(['weekday','hour'])['count'].mean().reset_index())\n\n#hour weather - count\nax4 = fig.add_subplot(2,2,4)\nax4 = sns.pointplot(x='hour',y='count',hue='weather',data=train.groupby(['weather','hour'])['count'].mean().reset_index())""",No,5,75.0 "# train[train.weather==4]'",No,5,14.0 "#month, weather - count fig = plt.figure(figsize=[12,10]) ax1 = fig.add_subplot(2,1,1) ax1 = 
sns.pointplot(x='month',y='count',hue='weather',data=train.groupby(['weather','month'])['count'].mean().reset_index()) #month - count ax2 = fig.add_subplot(2,1,2) ax2 = sns.barplot(x='month',y='count',data=train.groupby('month')['count'].mean().reset_index())",No,5,75.0 """"""" Windspeed: a large number of rows have windspeed 0, which looks like missing data rather than a true reading. Either drop those rows or predict the windspeed for the rows where it is 0 and fill it in. """""" # convert weekday to a categorical type and check its categories train['weekday']= train.weekday.astype('category') print(train['weekday'].cat.categories)",Yes,4,16.0 "b""# predict the missing (0) windspeed values with a random forest\n\nfrom sklearn.ensemble import RandomForestRegressor\n\n# rows where windspeed is 0\nwindspeed_0 = train[train.windspeed == 0]\n# rows where windspeed is not 0\nwindspeed_Not0 = train[train.windspeed != 0]\n\n# features for the rows where windspeed is 0\nwindspeed_0_df = windspeed_0.drop(['windspeed','casual','registered','count','datetime'],axis=1)\n\n# features and target for the rows where windspeed is not 0\nwindspeed_Not0_df = windspeed_Not0.drop(['windspeed','casual','registered','count','datetime'],axis=1)\nwindspeed_Not0_series = windspeed_Not0['windspeed'] \n\n# fit on the rows with a non-zero windspeed\nrf = RandomForestRegressor()\nrf.fit(windspeed_Not0_df,windspeed_Not0_series)\n\n# predict windspeed for the rows where it was 0\npredicted_windspeed_0 = rf.predict(windspeed_0_df)\n\n# fill in the predicted values\nwindspeed_0['windspeed'] = predicted_windspeed_0""",Yes,3,7.0 "# put the filled rows back together train = pd.concat([windspeed_0,windspeed_Not0],axis=0)",No,5,11.0 "b""# convert datetime from string type to datetime\ntrain.datetime = pd.to_datetime(train.datetime,errors='coerce')""",No,5,16.0 "b""# sort by datetime\ntrain = train.sort_values(by=['datetime'])""",No,5,9.0 "# correlation heatmap after filling windspeed fig = plt.figure(figsize=[20,20]) ax = sns.heatmap(train.corr(),annot=True,square=True)",No,5,80.0 "fig = plt.figure(figsize=[5,5]) sns.distplot(train['windspeed'],bins=np.linspace(train['windspeed'].min(),train['windspeed'].max(),10)) plt.suptitle(""Filled by Random Forest Regressor"") print(""Min value of windspeed is {}"".format(train['windspeed'].min()))",No,5,33.0 "b""# reload the train and test data\n\ntrain = pd.read_csv('/kaggle/input/bike-sharing-demand/train.csv')\ntest = pd.read_csv('/kaggle/input/bike-sharing-demand/test.csv')""",No,5,45.0 "combine = pd.concat([train,test],axis=0) combine.info()",Yes,4,11.0 "combine['tempDate'] = combine.datetime.apply(lambda x:x.split()) combine['weekday'] = combine.tempDate.apply(lambda x: calendar.day_name[datetime.strptime(x[0],""%Y-%m-%d"").weekday()]) combine['year'] = combine.tempDate.apply(lambda x: x[0].split('-')[0]) combine['month'] = combine.tempDate.apply(lambda x: x[0].split('-')[1]) combine['day'] = combine.tempDate.apply(lambda x: x[0].split('-')[2]) combine['hour'] = combine.tempDate.apply(lambda x: x[1].split(':')[0])",No,4,13.0 "combine['year'] = pd.to_numeric(combine.year,errors='coerce') combine['month'] = pd.to_numeric(combine.month,errors='coerce') combine['day'] = pd.to_numeric(combine.day,errors='coerce') combine['hour'] = pd.to_numeric(combine.hour,errors='coerce')",No,5,16.0 combine.info(),No,5,40.0 combine['season'] = combine.month.apply(badToRight),No,5,8.0 "combine.weekday = combine.weekday.astype('category') combine.weekday.cat.categories = ['5','1','6','0','4','2','3'] dataWind0 = combine[combine['windspeed']==0] dataWindNot0 = combine[combine['windspeed']!=0] dataWind0.columns",Yes,3,14.0 "dataWind0_df = dataWind0.drop(['windspeed','casual','registered','count','datetime','tempDate'],axis=1) dataWindNot0_df = dataWindNot0.drop(['windspeed','casual','registered','count','datetime','tempDate'],axis=1) dataWindNot0_series = dataWindNot0['windspeed'] dataWindNot0_df.head()",Yes,4,10.0 "rf2 = RandomForestRegressor() rf2.fit(dataWindNot0_df,dataWindNot0_series) predicted = rf2.predict(dataWind0_df) print(predicted)",Yes,4,7.0 "dataWind0['windspeed'] = predicted combine = 
pd.concat([dataWind0,dataWindNot0],axis=0)",No,5,11.0 "b""#\nfor col in categorizational_columns:\n combine[col] = combine[col].astype('category')""",No,5,16.0 "b""# countdatetime\ntrain = combine[pd.notnull(combine['count'])].sort_values(by='datetime')\ntest = combine[~pd.notnull(combine['count'])].sort_values(by='datetime')\n\n# \ndatetimecol = test['datetime']\nyLabels = train['count'] #count\nyLabelsRegistered = train['registered'] #\nyLabelsCasual = train['casual'] #""",Yes,3,21.0 "# columntraintest train = train.drop(drop_columns,axis=1) test = test.drop(drop_columns,axis=1)'",No,5,10.0 """"""" RMSLE RMSLE https://programmers.co.kr/learn/courses/21/lessons/943# RMSLE 0 """""" # y is predict value y_ is actual value def rmsle(y, y_,convertExp=True): if convertExp: y = np.exp(y), y_ = np.exp(y_) log1 = np.nan_to_num(np.array([np.log(v + 1) for v in y])) log2 = np.nan_to_num(np.array([np.log(v + 1) for v in y_])) calc = (log1 - log2) ** 2 return np.sqrt(np.mean(calc))'",No,5,84.0 "# # attr from sklearn.linear_model import LinearRegression,Ridge,Lasso lr = LinearRegression() """""" yLabelsnp.lognp.log1p np.log1pnp.log1+ xx0log - np.log1p """""" yLabelslog = np.log1p(yLabels) # lr.fit(train,yLabelslog) # preds = lr.predict(train) #rmsleelementnp.exppredsloglog print('RMSLE Value For Linear Regression: {}'.format(rmsle(np.exp(yLabelslog),np.exp(preds),False)))'",Yes,3,7.0 """"""" GridSearchCV GridSearchCV https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html https://datascienceschool.net/view-notebook/ff4b5d491cc34f94aea04baca86fbef8/ """""" from sklearn.model_selection import GridSearchCV from sklearn import metrics #RidgeL2alpha ridge = Ridge() #Ridge ridge_params = {'max_iter':[3000],'alpha':[0.001,0.01,0.1,1,10,100,1000]} rmsle_scorer = metrics.make_scorer(rmsle,greater_is_better=False) grid_ridge = GridSearchCV(ridge,ridge_params,scoring=rmsle_scorer,cv=5) grid_ridge.fit(train,yLabelslog) preds = grid_ridge.predict(train) print(grid_ridge.best_params_) print('RMSLE Value for Ridge Regression {}'.format(rmsle(np.exp(yLabelslog),np.exp(preds),False)))'",Yes,3,6.0 "#GridSearchCVgrid_ridgecv_result_alpha df = pd.DataFrame(grid_ridge.cv_results_)'",No,5,12.0 "b""#RidgeL1alpha\nlasso = Lasso()\n\nlasso_params = {'max_iter':[3000],'alpha':[0.001,0.01,0.1,1,10,100,1000]}\ngrid_lasso = GridSearchCV(lasso,lasso_params,scoring=rmsle_scorer,cv=5)\ngrid_lasso.fit(train,yLabelslog)\npreds = grid_lasso.predict(train)\nprint('RMSLE Value for Lasso Regression {}'.format(rmsle(np.exp(yLabelslog),np.exp(preds),False)))""",Yes,3,6.0 "rf = RandomForestRegressor() rf_params = {'n_estimators':[1,10,100]} grid_rf = GridSearchCV(rf,rf_params,scoring=rmsle_scorer,cv=5) grid_rf.fit(train,yLabelslog) preds = grid_rf.predict(train) print('RMSLE Value for RandomForest {}'.format(rmsle(np.exp(yLabelslog),np.exp(preds),False)))",Yes,3,6.0 "from sklearn.ensemble import GradientBoostingRegressor gb = GradientBoostingRegressor() gb_params={'max_depth':range(1,11,1),'n_estimators':[1,10,100]} grid_gb=GridSearchCV(gb,gb_params,scoring=rmsle_scorer,cv=5) grid_gb.fit(train,yLabelslog) preds = grid_gb.predict(train) print('RMSLE Value for GradientBoosting {}'.format(rmsle(np.exp(yLabelslog),np.exp(preds),False)))",Yes,4,6.0 "predsTest = grid_gb.predict(test) fig,(ax1,ax2)= plt.subplots(ncols=2) fig.set_size_inches(12,5) sns.distplot(yLabels,ax=ax1,bins=50) sns.distplot(np.exp(predsTest),ax=ax2,bins=50)",No,5,56.0 "submission = pd.DataFrame({ ""datetime"": datetimecol, ""count"": [max(0, 
x) for x in np.exp(predsTest)] }) submission.to_csv('bike_predictions_gbm_separate_without_fe.csv', index=False)'",Yes,5,25.0 "import calendar import seaborn as sb import xgboost as xgb import plotly.express as px import matplotlib.pyplot as plt from plotly.subplots import make_subplots from sklearn.tree import DecisionTreeRegressor from sklearn.ensemble import RandomForestRegressor from sklearn.preprocessing import PolynomialFeatures from sklearn.linear_model import LinearRegression, Ridge from sklearn.metrics import mean_squared_log_error,make_scorer from sklearn.model_selection import train_test_split,GridSearchCV",No,5,22.0 "#Reading the file file = pd.read_csv(""/kaggle/input/bike-sharing-demand/train.csv"")",No,5,45.0 file.describe(),No,5,40.0 file.isnull().sum(axis=0),No,5,39.0 file.columns,No,5,71.0 "corr = file[['temp','atemp','humidity', 'windspeed','casual', 'registered','count']].corr() f,axes = plt.subplots(1,1,figsize = (8,8)) sb.heatmap(corr,square=True,annot = True,linewidth = .5,center = 1.4,ax = axes)",No,5,80.0 "file = file file['Date'] = pd.DatetimeIndex(file['datetime']).date file['Hour'] = pd.DatetimeIndex(file['datetime']).hour file['Day'] = pd.DatetimeIndex(file['datetime']).day file['Month'] = pd.DatetimeIndex(file['datetime']).month file['Year'] = pd.DatetimeIndex(file['datetime']).year file['Weekday'] = pd.DatetimeIndex(file['datetime']).weekday_name",No,5,8.0 "fig = px.line(x = 'Date', y = ""count"", data_frame = file,color = 'Hour',range_y = (0,1150), title = 'Interactive LinePlot of the whole dataset(Hover for more details)', hover_data = ['Hour','Date','casual','registered'], hover_name = 'count_vis', text = None, height = 670,width = 980) fig.show()'",No,5,75.0 "f,axes = plt.subplots(1,3,figsize = (17,7)) sb.despine(left = True) x = 'season' sb.barplot(x = x , y = 'casual' , data = file, saturation = 1, ax = axes[0]) sb.barplot(x = x , y = 'registered' , data = file, saturation = 1, ax = axes[1]) sb.barplot(x = x , y = 'count' , data = file, saturation = 1, ax = axes[2])",No,5,33.0 "f,axes = plt.subplots(1,3,figsize = (17,7)) sb.despine(left = True) x = 'holiday' sb.barplot(x = x , y = 'casual' , data = file, saturation = 1, ax = axes[0] ,) sb.barplot(x = x , y = 'registered' , data = file, saturation = 1, ax = axes[1]) sb.barplot(x = x , y = 'count' , data = file, saturation = 1, ax = axes[2])",No,5,33.0 "f,axes = plt.subplots(1,3,figsize = (17,7)) sb.despine(left = True) x = 'workingday' sb.barplot(x = x , y = 'casual' , data = file, saturation = 1, ax = axes[0] ,) sb.barplot(x = x , y = 'registered' , data = file, saturation = 1, ax = axes[1]) sb.barplot(x = x , y = 'count' , data = file, saturation = 1, ax = axes[2])",No,5,33.0 "f,axes = plt.subplots(1,3,figsize = (17,7)) sb.despine(left = True) x = 'weather' sb.barplot(x = x , y = 'casual' , data = file, saturation = 1, ax = axes[0] ,) sb.barplot(x = x , y = 'registered' , data = file, saturation = 1, ax = axes[1]) sb.barplot(x = x , y = 'count' , data = file, saturation = 1, ax = axes[2])",No,5,33.0 "f,axes = plt.subplots(1,3,figsize = (19,7)) sb.despine(left = True) x = 'Hour' sb.barplot(x = x , y = 'casual' , data = file, saturation = 1, ax = axes[0] ,) sb.barplot(x = x , y = 'registered' , data = file, saturation = 1, ax = axes[1]) sb.barplot(x = x , y = 'count' , data = file, saturation = 1, ax = axes[2])",No,5,75.0 "file1 = file.groupby(['Hour','Weekday']).mean().reset_index() dic = {'Weekday':['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']} dic1 = {'registered':'Average 
count of registered poeple commuting.','count': 'Average people commuting','Hour':'Hour of the day', 'Weekday':'Day of the week'} fig = px.line(x = 'Hour', y = ""registered"", data_frame = file1.reset_index(),color = 'Weekday', title = 'Interactive LinePlot of the registered separated by weekday(Hover for more details)',labels = dic1, hover_data = ['count'],category_orders = dic,range_y = [0,550],height = 670,width = 980) fig.show()'",No,5,75.0 "f,axes = plt.subplots(1,3,figsize = (19,7)) sb.despine(left = True) x = 'Day' sb.barplot(x = x , y = 'casual' , data = file, saturation = 1, ax = axes[0] ,) sb.barplot(x = x , y = 'registered' , data = file, saturation = 1, ax = axes[1]) sb.barplot(x = x , y = 'count' , data = file, saturation = 1, ax = axes[2])",No,5,75.0 "f,axes = plt.subplots(1,3,figsize = (19,7)) sb.despine(left = True) x = 'Month' #order = ['January','February','March','April','May','June','July','August','September','October','November','December'] plot = sb.barplot(x = x , y = 'casual' , data = file, saturation = 1, ax = axes[0]) sb.barplot(x = x , y = 'registered' , data = file, saturation = 1, ax = axes[1]) sb.barplot(x = x , y = 'count' , data = file, saturation = 1, ax = axes[2])",No,5,75.0 "f,axes = plt.subplots(1,3,figsize = (19,7)) sb.despine(left = True) x = 'Year' sb.barplot(x = x , y = 'casual' , data = file, saturation = 1, ax = axes[0] ,) sb.barplot(x = x , y = 'registered' , data = file, saturation = 1, ax = axes[1]) sb.barplot(x = x , y = 'count' , data = file, saturation = 1, ax = axes[2])",No,5,75.0 "for i in file.groupby('season').count().index: s = 's'+str(i) a=[] for j in file.season: if j==i: a.append(1) else: a.append(0) file[s]=a file.sample(5)",No,5,53.0 "for i in file.groupby('weather').count().index: s = 'w'+str(i) a=[] for j in file.weather: if j==i: a.append(1) else: a.append(0) file[s]=a file.sample(5)",Yes,5,53.0 "for i in file.groupby('Hour').count().index: s = 'Hour'+str(i) a=[] for j in file.Hour: if j==i: a.append(1) else: a.append(0) file[s]=a file.sample(5)",Yes,5,8.0 "for i in file.groupby(""Month"").count().index: s = 'Month' + str(i) a = [] for j in file.Month: if j==i: a.append(1) else: a.append(0) file[s] = a file.sample(5)'",Yes,5,53.0 feed.describe(),No,5,40.0 feed.columns,No,5,71.0 "df_train_x = feed.drop('casual',axis = 1).drop('registered',axis=1) df_train_x.describe()",Yes,4,10.0 "df_reg_train_y = feed['registered'] df_reg_train_y.describe",Yes,5,40.0 "df_cas_train_y = feed['casual'] df_cas_train_y.describe",Yes,4,40.0 "x1_train, x1_test, y1_train, y1_test = train_test_split(df_train_x, df_cas_train_y, test_size=0.15, random_state=42) x2_train, x2_test, y2_train, y2_test = train_test_split(df_train_x, df_reg_train_y, test_size=0.15, random_state=42)",No,5,13.0 "rf = RandomForestRegressor() parameters = {'n_estimators':[50,100,150,200,250], 'min_impurity_decrease':[0.0,0.001,0.01], 'max_depth':[20,40,60,80,100]} models = ['Normal Linear Regression: ','Linear Regression over polynomial: ', 'Decision Tree Regressor: ','XG Boosting: ']",Yes,3,5.0 "predict = [] reg = LinearRegression().fit(x1_train, y1_train) pre_reg = reg.predict(x1_test) reg_poly = LinearRegression().fit(poly_x1_train, y1_train) pre_reg_poly = reg_poly.predict(poly_x1_test) rf_reg = GridSearchCV(rf, parameters, cv=5, verbose=2,scoring = scorer,n_jobs = -1) rf_reg.fit(x1_train, y1_train) pre_rf_reg = rf_reg.predict(x1_test) predict.append(pre_reg) predict.append(pre_reg_poly) predict.append(pre_rf_reg)",Yes,3,7.0 "predict = [] cas = LinearRegression().fit(x2_train, 
y2_train) pre_cas = cas.predict(x2_test) cas_poly = LinearRegression().fit(poly_x2_train, y2_train) pre_cas_poly = cas_poly.predict(poly_x2_test) rf_cas = GridSearchCV(rf, parameters, cv=5, verbose=2,scoring = scorer,n_jobs = -1) rf_cas.fit(x2_train, y2_train) pre_rf_cas = rf_cas.predict(x2_test) predict.append(pre_cas) predict.append(pre_cas_poly) predict.append(pre_rf_cas)",Yes,3,7.0 "print(""For Random Forest Model: "") print(""\\t Best Parametres for registered are: "",end='') print(rf_reg.best_params_) print(""\\t Best Parametres for casual are: "",end = '') print(rf_cas.best_params_)'",No,2,2.0 "predict1 = [] reg1 = LinearRegression().fit(x1_train, y1_train) pre_reg1 = reg1.predict(x1_test) reg1_poly = LinearRegression().fit(poly_x1_train, y1_train) pre_reg1_poly = reg1_poly.predict(poly_x1_test) rf1 = RandomForestRegressor(n_estimators = 200,max_depth=80,min_impurity_decrease = 0.001).fit(x1_train, y1_train) pre_rf1 = rf1.predict(x1_test) for i in range(pre_reg1.size): if pre_reg1[i]<1: pre_reg1[i] = 1 if pre_reg1_poly[i]<1: pre_reg1_poly[i] = 1 if pre_rf1[i]<1: pre_rf1[i] = 1 predict1.append(pre_reg1) predict1.append(pre_reg1_poly) predict1.append(pre_rf1) x1_final = x1_test.copy() x1_final['Output'] = y1_test x1_final['Lin_reg'] = pre_reg1 x1_final['Lin_reg_poly'] = pre_reg1_poly x1_final['RF_reg'] = pre_rf1 x1_final['Resid'] = y1_test-pre_reg1 x1_final['Resid_poly'] = y1_test-pre_reg1_poly for prediction in predict1: print(np.sqrt(mean_squared_log_error( y1_test, prediction )))",Yes,2,7.0 "predict2 = [] reg2 = LinearRegression().fit(x2_train, y2_train) pre_reg2 = reg2.predict(x2_test) reg2_poly = LinearRegression().fit(poly_x2_train, y2_train) pre_reg2_poly = reg2_poly.predict(poly_x2_test) rf2 = RandomForestRegressor(n_estimators = 150,max_depth=60,min_impurity_decrease = 0.0).fit(x2_train, y2_train) pre_rf2 = rf2.predict(x2_test) for i in range(pre_reg2.size): if pre_reg2[i]<1: pre_reg2[i] = 1 if pre_reg2_poly[i]<1: pre_reg2_poly[i] = 1 if pre_rf2[i]<1: pre_rf2[i] = 1 predict2.append(pre_reg2) predict2.append(pre_reg2_poly) predict2.append(pre_rf2) x2_final = x2_test.copy() x2_final['Output'] = y2_test x2_final['Lin_reg'] = pre_reg2 x2_final['Lin_reg_poly'] = pre_reg2_poly x2_final['RF_reg'] = pre_rf2 x2_final['Resid'] = y2_test-pre_reg2 x2_final['Resid_poly'] = y2_test-pre_reg2_poly for prediction in predict2: print(np.sqrt(mean_squared_log_error( y2_test, prediction )))",Yes,3,7.0 "from plotly.subplots import make_subplots name1 = ['Residual for casual without polynomial features'] *1633 name2 = ['Residual for casual with polynomial features'] *1633 name3 = ['Residual for registered without polynomial features'] *1633 name4 = ['Residual for registered with polynomial features'] *1633 dic = {'Lin_reg': 'Predicted Output','Resid':'Deviation from predicted','Output':'Expected Output','Lin_reg_poly': 'Predicted Output', 'Resid_poly':'Deviation from predicted'} fig1 = px.scatter(data_frame = x1_final,x = 'Lin_reg', y = 'Resid',hover_data = ['Output'],labels = dic,hover_name = name1, color_discrete_sequence = ['red']) fig2 = px.scatter(data_frame = x1_final,x = 'Lin_reg_poly', y = 'Resid_poly',hover_data = ['Output'],labels = dic,hover_name = name2, color_discrete_sequence = ['blue']) fig3 = px.scatter(data_frame = x2_final,x = 'Lin_reg', y = 'Resid',hover_data = ['Output'],labels = dic,hover_name = name3, color_discrete_sequence = ['darkgreen']) fig4 = px.scatter(data_frame = x2_final,x = 'Lin_reg_poly', y = 'Resid_poly',hover_data = ['Output'],labels = dic,hover_name = name4, 
color_discrete_sequence = ['gold']) trace1 = fig1['data'][0] trace2 = fig2['data'][0] trace3 = fig3['data'][0] trace4 = fig4['data'][0] fig = make_subplots(rows=2, cols=2,horizontal_spacing =0.1,vertical_spacing = 0.2, row_titles = ['Using Polynomial','Without Polynomial'],column_titles = ['Casual','Registered'], x_title = 'Residual plots for Registered and Casual under different models (Hover for more details)') fig.add_trace(trace1, row=1, col=1) fig.add_trace(trace2, row=1, col=2) fig.add_trace(trace3, row=2, col=1) fig.add_trace(trace4, row=2, col=2) fig.show()",No,5,56.0 "rf1 = RandomForestRegressor(n_estimators = 200,max_depth=80,min_impurity_decrease = 0.001).fit(df_train_x,df_cas_train_y) rf2 = RandomForestRegressor(n_estimators = 150,max_depth=60,min_impurity_decrease = 0.0).fit(df_train_x,df_reg_train_y)",No,5,7.0 test_file = pd.read_csv('/kaggle/input/bike-sharing-demand/test.csv'),No,5,45.0 "test=test_file test.describe()",No,5,40.0 "test['mth'] = pd.DatetimeIndex(test['datetime']).month test['yr'] = pd.DatetimeIndex(test['datetime']).year test['dy'] = pd.DatetimeIndex(test['datetime']).day test['hr'] = pd.DatetimeIndex(test['datetime']).hour for i in test.groupby(""season"").count().index: s = 's' + str(i) a = [] for j in test.season: if j==i: a.append(1) else: a.append(0) test[s] = a for i in test.groupby(""weather"").count().index: s = 'w' + str(i) a = [] for j in test.weather: if j==i: a.append(1) else: a.append(0) test[s] = a for i in test.groupby('hr').count().index: s = 'hr'+str(i) a=[] for j in test.hr: if j==i: a.append(1) else: a.append(0) test[s]=a for i in test.groupby(""mth"").count().index: s = 'm' + str(i) a = [] for j in test.mth: if j==i: a.append(1) else: a.append(0) test[s] = a test.sample(10)'",No,5,8.0 "pre_cas = rf1.predict(test) pre_reg = rf2.predict(test) final_predictions = pd.DataFrame(pre_cas+pre_reg,columns = ['cout']) final_predictions.describe",Yes,4,48.0 "final_predictions['datetime']=test_file['datetime'] final_predictions = final_predictions[['datetime','count']]",No,5,55.0 "#Importing libraries import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns from scipy.stats import chi2_contingency from scipy.stats import spearmanr %matplotlib inline import itertools import os import calendar from datetime import datetime from scipy import stats from scipy.special import inv_boxcox from sklearn.preprocessing import StandardScaler from sklearn.model_selection import train_test_split as split for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename)) ",Yes,5,88.0 "def change_to_categorical(data, max_cat=10): df = data.copy() for col in df.columns: if df[col].dtype == object or str(df[col].dtype).startswith(('int','float')): count = len(df[col].unique()) if count <= max_cat: df[col] = df[col].astype('category') return df",No,4,16.0 "def get_count_plot(x, df, ax, y=None, value_counts = None, print_percent = False): if value_counts is None: counts = df[x].value_counts().sort_index() else: counts = df[value_counts] #counts.plot.bar() #sns bars are just more colorful :P if y is None: sns.countplot(x, data=df, ax=ax) else: sns.barplot(x, y, data=df, ax=ax) if print_percent: print_percent_count_plot(counts, ax)",No,4,33.0 "def get_count_plot_for_categorical(df, n_cols = 2, y='cnt', list_cat=None, value_counts=None, print_percent=False): if list_cat is None: num_col, cat_col = get_numerical_and_categorical_col(df) else: cat_col = list_cat f, axs, n_rows = 
get_fig_and_axis_for_subplots(len(cat_col), n_cols) for i, col in enumerate(cat_col): ax = plt.subplot(n_rows, n_cols, i+1) get_count_plot(col, df, ax, y, value_counts, print_percent)",No,5,33.0 "def get_target_dist_with_categorical(df, n_cols = 2, y='cnt', list_cat=None, plot_type = 'box'): if list_cat is None: num_col, cat_col = get_numerical_and_categorical_col(df) else: cat_col = list_cat f, axs, n_rows = get_fig_and_axis_for_subplots(len(cat_col), n_cols) for i, col in enumerate(cat_col): ax = plt.subplot(n_rows, n_cols, i+1) if plot_type == 'box': sns.boxplot(x=col, data=df,y=y,orient=""v"",ax=ax) else: sns.violinplot(col, data=df,y=y,orient=""v"",ax=ax)'",No,5,33.0 "def get_plot_for_numerical(df, n_cols = 2, plot_type='probability',list_col=None, hist=True, kde=True): if list_col is None: num_col, cat_col = get_numerical_and_categorical_col(df) else: num_col = list_col f, axs, n_rows = get_fig_and_axis_for_subplots(len(num_col), n_cols) for i, col in enumerate(num_col): ax = plt.subplot(n_rows, n_cols, i+1) if plot_type == 'probability': sns.distplot(df[col], hist=hist, kde=hist, color = 'darkblue', hist_kws={'edgecolor':'black'}, kde_kws={'linewidth': 4}) elif plot_type == 'box': sns.boxplot(data=df,y=col,orient=""v"",ax=ax) else: sns.violinplot(data=df,y=col,orient=""v"",ax=ax)'",No,5,33.0 "# visualize correlation matrix def visualize_corr_matrix(data): numerical_col, cat_col = get_numerical_and_categorical_col(data) df = data[numerical_col] corr = df.corr()# plot the heatmap #generating masks for upper triangle so that values are not repeated mask_ut=np.triu(np.ones(corr.shape)).astype(np.bool) sns.heatmap(corr, mask=mask_ut, xticklabels=corr.columns, yticklabels=corr.columns, annot=True, cmap=sns.diverging_palette(220, 20, as_cmap=True))",No,5,80.0 "def remove_outliers_for_variable_by_quantiles(data, col, q1=.25, q2=.75): df = data.copy() median = df[col].median() q25, q75 = df[col].quantile([q1,q2]) iqr = q75-q25 upper_wh = q75 +1.5*iqr lower_wh = q25 - 1.5*iqr whiskers = int(np.floor(lower_wh)), int(np.ceil(upper_wh)) df.drop(df[~df[col].between(whiskers[0], whiskers[1]) & (~np.isnan(df[col]))].index, inplace=True) return df",No,5,8.0 "def remove_outliers_for_variable_by_std(data, col): df = data.copy() df = df[np.abs(df[col]-df[col].mean())<=(3*df[col].std())] return df",No,5,14.0 "#loop for chi square values def calculate_chi_square_values(df, alpha=.05): chi2_dict = {} numerical_col, cat_col = get_numerical_and_categorical_col(df) for i in cat_col: for j in cat_col: if i!=j and (j+' '+i) not in chi2_dict.keys(): chi2, p, dof, ex = chi2_contingency(pd.crosstab(df[i], df[j])) chi2_dict[i+' '+j] = 'Independent? 
'+ str(p>alpha) return chi2_dict",No,5,47.0 "def rmsle(y, y_,convertExp=True): if convertExp: y = inv_boxcox(y, fitted_lambda), y_ = inv_boxcox(y_, fitted_lambda) log1 = np.nan_to_num(np.array([np.log(v + 1) for v in y])) log2 = np.nan_to_num(np.array([np.log(v + 1) for v in y_])) calc = (log1 - log2) ** 2 return np.sqrt(np.mean(calc))",No,5,84.0 "def plot_prediction(test, test_pred, train, train_pred, convert_to_original_form = False): if convert_to_original_form: test = inv_boxcox(test, fitted_lambda), test_pred = inv_boxcox(test_pred, fitted_lambda) train = inv_boxcox(train, fitted_lambda), train_pred = inv_boxcox(train_pred, fitted_lambda) f, ax = plt.subplots(1,2, figsize=(10, 5)) ax1 = plt.subplot(1,2,1) sns.distplot(test, hist=True, kde=True, color = 'darkblue', hist_kws={'edgecolor':'black'}, kde_kws={'linewidth': 4}, ax = ax1) sns.distplot(test_pred, hist=True, kde=True, color = 'red', hist_kws={'edgecolor':'red'}, kde_kws={'linewidth': 4}, ax = ax1) ax1.set_title(""Actual vs Predicted (Test)"") ax2 = plt.subplot(1,2,2) sns.distplot(train, hist=True, kde=True, color = 'darkblue', hist_kws={'edgecolor':'black'}, kde_kws={'linewidth': 4}, ax = ax2) sns.distplot(train_pred, hist=True, kde=True, color = 'red', hist_kws={'edgecolor':'red'}, kde_kws={'linewidth': 4}, ax = ax2) ax2.set_title(""Actual vs Predicted (Train)"")'",No,5,56.0 "df_hour = pd.read_csv(""/kaggle/input/bike-sharing-demand/train.csv"") df_test = pd.read_csv(""/kaggle/input/bike-sharing-demand/test.csv"")",No,5,45.0 df_hour.head(),No,5,41.0 "df_hour[""dteday""] = df_hour.datetime.apply(lambda x : x.split()[0]) df_hour[""yr""] = df_hour.datetime.apply(lambda x : x.split()[0][:4]) df_hour['yr'] = df_hour.yr.map({'2011': 0, '2012':1}) df_hour[""hr""] = df_hour.datetime.apply(lambda x : x.split()[1].split("":"")[0]) df_hour[""weekday""] = df_hour.dteday.apply(lambda dateString : calendar.day_name[datetime.strptime(dateString,""%Y-%m-%d"").weekday()]) df_hour.weekday = df_hour.weekday.map({'Saturday':6, 'Sunday':0, 'Monday':1, 'Tuesday':2, 'Wednesday':3, 'Thursday':4, 'Friday':5}) df_hour[""mnth""] = df_hour.dteday.apply(lambda dateString : calendar.month_name[datetime.strptime(dateString,""%Y-%m-%d"").month]) df_hour.mnth = df_hour.mnth.map({'January':0, 'February':1, 'March':2, 'April':3, 'May':4, 'June':5, 'July':6, 'August':7, 'September':8, 'October':9, 'November':10, 'December':11}) df_hour[""weathersit""] = df_hour.weather df_hour['dteday'] = pd.to_datetime(df_hour['dteday']) del df_hour['weather'] del df_hour['datetime']'",Yes,4,8.0 "#performing same on test #performing same on test df_test[""dteday""] = df_test.datetime.apply(lambda x : x.split()[0]) df_test[""yr""] = df_test.datetime.apply(lambda x : x.split()[0][:4]) df_test['yr'] = df_test.yr.map({'2011': 0, '2012':1}) df_test[""hr""] = df_test.datetime.apply(lambda x : x.split()[1].split("":"")[0]) df_test[""weekday""] = df_test.dteday.apply(lambda dateString : calendar.day_name[datetime.strptime(dateString,""%Y-%m-%d"").weekday()]) df_test.weekday = df_test.weekday.map({'Saturday':6, 'Sunday':0, 'Monday':1, 'Tuesday':2, 'Wednesday':3, 'Thursday':4, 'Friday':5}) df_test[""mnth""] = df_test.dteday.apply(lambda dateString : calendar.month_name[datetime.strptime(dateString,""%Y-%m-%d"").month]) df_test.mnth = df_test.mnth.map({'January':0, 'February':1, 'March':2, 'April':3, 'May':4, 'June':5, 'July':6, 'August':7, 'September':8, 'October':9, 'November':10, 'December':11}) df_test[""weathersit""] = df_test.weather df_test['dteday'] = 
pd.to_datetime(df_test['dteday']) del df_test['weather']'",Yes,5,8.0 "df_hour['cnt'] = df_hour['count'] del df_hour['count'] df_hour.shape",Yes,4,61.0 df_hour.info(),No,5,40.0 df_hour.columns,No,5,71.0 "#Creating a copy of df to preserve original df df = df_hour.copy()",No,5,12.0 "#Analysing which of these are categorical variables for col in df.columns: print('Count of unique values for ', col, ': ', len(df[col].unique()))",No,5,54.0 "'''We know maximum categories for any categorical col is 24 (for month) Hence we can use this to def function to convert variable to categorical type''' df = change_to_categorical(df, max_cat=24)",No,5,16.0 "dataTypeDf = (df.dtypes.astype(str).value_counts()).reset_index().rename(columns={""index"":""variableType"",0:""count""}) fig,ax = plt.subplots() fig.set_size_inches(12,5) get_count_plot('variableType',dataTypeDf, ax, 'count', value_counts='count', print_percent=True)'",No,3,33.0 df[get_numerical_and_categorical_col(df)[0]].describe(),No,5,40.0 df[get_numerical_and_categorical_col(df)[1]].describe(include='all'),No,5,40.0 "get_plot_for_numerical(df,3)",No,3,33.0 "plt.figure(figsize=(7,5)) visualize_corr_matrix(df)",No,3,80.0 "'''Let's start creating a list which contains all the variables to be deleted. We can delete them once we're done with our exploratory analysis''' cols_to_remove = ['registered','casual','windspeed'] #besides atemp should be deleted immediately for obvious reasons! del df['atemp'] del df_test['atemp']",No,5,10.0 "get_plot_for_numerical(df, 3, plot_type='box')",No,5,33.0 "get_target_dist_with_categorical(df,n_cols=3)",No,5,53.0 "get_target_dist_with_categorical(df, n_cols=2, plot_type='violin')",No,5,33.0 "#Let's perform categorical test chi2 to decide which categorical columns to delete chi2_dict = calculate_chi_square_values(df) chi2_dict",No,5,47.0 "sns.pointplot(x='hr',y='cnt',data=df, hue='season', markers = 'x')",No,5,33.0 "sns.pointplot(x='hr',y='cnt',data=df, hue='weekday', markers = 'x')",No,5,33.0 "#to visualize similar plot for type of user, we would need to use melt #what melt would do, take each hour and generate rows for value variables. 
Next we'll use this to find mean for each hour and for each type of users hr_users_type = pd.melt(df[[""hr"",""casual"",""registered""]], id_vars=['hr'], value_vars=['casual', 'registered']).sort_values(by='hr') hr_users_type.head()'",Yes,2,8.0 "hr_users_type_mean = pd.DataFrame(hr_users_type.groupby([""hr"",""variable""],sort=True)[""value""].mean()).reset_index() hr_users_type_mean.head()",Yes,4,12.0 "sns.pointplot(x=hr_users_type_mean[""hr""], y=hr_users_type_mean[""value""],hue=hr_users_type_mean[""variable""],hue_order=[""casual"",""registered""], data=hr_users_type_mean, join=True)",No,5,33.0 "from sklearn.ensemble import RandomForestRegressor np.random.seed(42) # drop target columns df_original = df.copy() drop_cols=['cnt', 'dteday','registered','casual'] X = df.drop(drop_cols, axis = 1) # X = independent columns (potential predictors) y = df['cnt'] # y = target column (what we want to predict) # instantiate RandomForestClassifier rf_model = RandomForestRegressor() rf_model.fit(X,y) feat_importances = pd.Series(rf_model.feature_importances_, index=X.columns) # determine 20 most important features df_imp_feat = feat_importances.nlargest(20) df_imp_feat.plot(kind='bar') plt.show() print(df_imp_feat) print('Comparing with our columns') print(cols_to_remove)",Yes,3,7.0 "df_cleaned = df.copy() df_cleaned.drop(cols_to_remove, axis=1, inplace=True) df_test.drop(cols_to_remove, axis=1, inplace=True, errors='ignore') df_cleaned.head()",Yes,5,10.0 df_cleaned.describe(),No,5,40.0 "df_cleaned.drop(['cnt','cnt_log','box_cox_reverse'], axis=1, inplace=True) df_cleaned.rename(columns={'cnt_box_cox':'count_transformed'}, inplace=True)",Yes,4,10.0 sc = StandardScaler(),No,2,18.0 "target = 'count_transformed' X = df_cleaned.drop(target, axis=1) y = df_cleaned[target] seed=23 X_train, X_test, y_train, y_test = split(X, y, test_size=.3, random_state=seed)",Yes,5,13.0 "X_train = sc.fit_transform(X_train) X_test = sc.fit_transform(X_test)",No,4,18.0 "from sklearn.linear_model import LinearRegression,Ridge,Lasso from sklearn.model_selection import GridSearchCV, RandomizedSearchCV from sklearn.metrics import mean_squared_error as mse from sklearn import metrics # Initialize logistic regression model lr = LinearRegression() # Train the model lr.fit(X_train,y = y_train) # Make predictions y_pred = lr.predict(X_test) y_pred_train = lr.predict(X_train) print('RMSLE for test: ',rmsle(y_test, y_pred, True)) print('RMSLE for train: ',rmsle(y_train, y_pred_train, True))",Yes,3,7.0 "coeff_df = pd.DataFrame(lr.coef_, X.columns, columns=['Coefficient']) coeff_df",No,4,79.0 "plot_prediction(y_test, y_pred, y_train, y_pred_train, True)",No,4,56.0 "ridge = Ridge() ridge_param = {'max_iter':[3000], 'alpha':[.1,.03,.3,1,3,10, 30, 100,300]} rmsle_scorer = metrics.make_scorer(rmsle, greater_is_better=False) grid_ridge = GridSearchCV(ridge, ridge_param, scoring = rmsle_scorer, cv = 10) grid_ridge.fit(X_train, y_train) y_pred_ridge = grid_ridge.predict(X_test) y_pred_ridge_train = grid_ridge.predict(X_train) print('Grid Ridge Best Params: ', grid_ridge.best_params_) print('RMSLE for test: ',rmsle(y_test, y_pred_ridge, True)) print('RMSLE for train: ',rmsle(y_train, y_pred_ridge_train, True))",Yes,3,6.0 "fig,ax= plt.subplots() fig.set_size_inches(12,5) df = pd.DataFrame(grid_ridge.cv_results_) df[""alpha""] = df[""params""].apply(lambda x:x[""alpha""]) df[""rmsle""] = df[""mean_test_score""].apply(lambda x:-x) sns.pointplot(data=df,x=""alpha"",y=""rmsle"",ax=ax)",No,5,81.0 "plot_prediction(y_test, y_pred_ridge, y_train, 
y_pred_ridge_train, True)",No,4,56.0 "lasso = Lasso() alpha = 1/np.array([.1,.03,.3,1,3,10, 30, 100,300,1000]) lasso_param = {'max_iter':[3000], 'alpha':alpha} rmsle_scorer = metrics.make_scorer(rmsle, greater_is_better=False) random_lasso = RandomizedSearchCV(lasso, lasso_param, scoring = rmsle_scorer, cv = 10) random_lasso.fit(X_train, y_train) y_pred_lasso = random_lasso.predict(X_test) y_pred_lasso_train = random_lasso.predict(X_train) print('Random Lasso Best Params: ', random_lasso.best_params_) print('RMSLE for test: ',rmsle(y_test, y_pred_lasso, True)) print('RMSLE for train: ',rmsle(y_train, y_pred_lasso_train, True))",No,3,6.0 "fig,ax= plt.subplots() fig.set_size_inches(12,5) df = pd.DataFrame(random_lasso.cv_results_) df[""alpha""] = df[""params""].apply(lambda x:x[""alpha""]) df[""rmsle""] = df[""mean_test_score""].apply(lambda x:-x) sns.pointplot(data=df,x=""alpha"",y=""rmsle"",ax=ax)",No,5,84.0 "plot_prediction(y_test, y_pred_lasso, y_train, y_pred_lasso_train, True)",No,4,56.0 "from sklearn.tree import DecisionTreeRegressor as dt dt_m = dt(random_state=0) dt_m.fit(X_train,y_train) y_pred_dt=dt_m.predict(X_test) y_pred_dt_train=dt_m.predict(X_train) print('RMSLE for test: ',rmsle(y_test, y_pred_dt, True)) print('RMSLE for train: ',rmsle(y_train, y_pred_dt_train, True))",Yes,4,7.0 "plot_prediction(y_test, y_pred_dt, y_train, y_pred_dt_train, True)",No,4,56.0 "from sklearn.ensemble import RandomForestRegressor as rfr rf = rfr(n_estimators=100) rf.fit(X_train, y_train) y_pred_rf = rf.predict(X_test) y_pred_rf_train = rf.predict(X_train) print('RMSLE for test: ',rmsle(y_test, y_pred_rf, True)) print('RMSLE for train: ',rmsle(y_train, y_pred_rf_train, True))",Yes,3,7.0 "plot_prediction(y_test, y_pred_rf, y_train, y_pred_rf_train, True)",No,4,56.0 "from sklearn.ensemble import GradientBoostingRegressor gbm = GradientBoostingRegressor(n_estimators=3000,alpha=.03) gbm.fit(X_train,y_train) y_pred_gbm = gbm.predict(X_test) y_pred_gbm_train = gbm.predict(X_train) print('RMSLE for test: ',rmsle(y_test, y_pred_gbm, True)) print('RMSLE for train: ',rmsle(y_train, y_pred_gbm_train, True))",Yes,3,7.0 "plot_prediction(y_test, y_pred_gbm, y_train, y_pred_gbm_train, True)",No,4,56.0 "df_test = df_test.sort_values(by='datetime') datetime_series = df_test.datetime df_test_for_model = df_test.copy() df_test_for_model.drop(['datetime'], inplace=True, axis=1) X_test_ndarry = df_test_for_model.to_numpy() final_X_test = sc.fit_transform(X_test_ndarry) final_y_pred = inv_boxcox(gbm.predict(final_X_test), fitted_lambda)",Yes,3,21.0 "final_y_pred.shape, datetime_series.shape",No,5,58.0 "submission = pd.DataFrame({'datetime':datetime_series, 'count':np.round(final_y_pred)})",No,5,55.0 "submission_rf = pd.DataFrame({'datetime':datetime_series, 'count':final_y_pred_rf})",No,5,12.0 "sns.distplot(final_y_pred, hist=True, kde=True, color = 'darkblue', hist_kws={'edgecolor':'black'}, kde_kws={'linewidth': 4})",No,5,33.0 "#Just to avoid any missing values submission[submission['count'].isna()]",No,4,39.0 "#let's see values around it to fill these #intuition knn submission.iloc[721],submission.iloc[720],submission.iloc[719]",No,4,41.0 "#By this and also by our initial analysis, afternoon is not a preferable time to ride bike submission.fillna(0, inplace=True)",No,5,17.0 "submission.iloc[725],submission.iloc[726],submission.iloc[727]",No,4,41.0 "submission.to_csv('bike_predictions_rounded.csv', index=False) submission.to_csv('bike_predictions_random_forest.csv', index=False)",No,5,25.0 "import pandas as pd import 
calendar import numpy as np import seaborn as sns import matplotlib.pyplot as plt from scipy.special import boxcox, inv_boxcox from datetime import datetime from numpy import arange from pandas import read_csv from sklearn.linear_model import Ridge,Lasso from sklearn.model_selection import RepeatedKFold from sklearn.model_selection import GridSearchCV from sklearn.tree import DecisionTreeRegressor from sklearn.ensemble import RandomForestRegressor train_df=pd.read_csv('/kaggle/input/bike-sharing-demand/train.csv') train_df.head(6)",Yes,4,45.0 "#Fig Will show count distribution sns.distplot(train_df['count']) plt.show() ",No,5,33.0 "#Fig Will show count distribution post log transformation train_df['count']=train_df['count'].apply(lambda x:np.log(x)) sns.distplot(train_df['count']) plt.show() print (train_df['count']) train_df.shape",Yes,4,33.0 "#Visualize Count wrt categorical variables cat_names=['season', 'holiday', 'workingday', 'weather'] i=0 for name in cat_names: i=i+1 plt.subplot(2,2,i) sns.countplot(name,data=train_df) plt.show()",No,5,33.0 "#Visualize data wrt continous variables. cont_names=['temp','atemp','humidity','windspeed'] i=0 for name in cont_names: i=i+1 plt.subplot(2,2,i) sns.boxplot(name,data=train_df) plt.show() #Windspeed seems to be skewed",No,5,33.0 "#Splitting out Datetime attribute in dataframe and dropping unwanted variables as per before analysis new_df=train_df.copy(deep=True) new_df['day']=new_df['datetime'].apply(lambda dateString : calendar.day_name[datetime.strptime(dateString,""%Y-%m-%d %H:%M:%S"").weekday()]) new_df['datetime'] = pd.to_datetime(new_df['datetime'], format='%Y-%m-%d %H:%M:%S') new_df['month']=new_df['datetime'].apply(lambda x:x.month) new_df['hour']=new_df['datetime'].apply(lambda x:x.hour) new_df['year']=new_df['datetime'].apply(lambda x:x.year) final_df=new_df.copy(deep=True) final_df=new_df.drop(['datetime','temp','casual','registered'], axis=1) final_df.head()'",Yes,3,16.0 "#adding dummy varibles to categorical variables dropping the souce columns weather_df=pd.get_dummies(final_df['weather'],prefix='w',drop_first=True) yr_df=pd.get_dummies(final_df['year'],prefix='y',drop_first=True) month_df=pd.get_dummies(final_df['month'],prefix='m',drop_first=True) hour_df=pd.get_dummies(final_df['hour'],prefix='h',drop_first=True) season_df=pd.get_dummies(final_df['season'],prefix='s',drop_first=True) day_df=pd.get_dummies(final_df['day'],prefix='d',drop_first=True) final_df=final_df.drop(['weather','year','month','hour','season','day'], axis=1) final_df=final_df.join(weather_df) final_df=final_df.join(yr_df) final_df=final_df.join(month_df) final_df=final_df.join(hour_df) final_df=final_df.join(season_df) final_df=final_df.join(day_df)",Yes,4,20.0 "print(final_df.columns.to_series().groupby(final_df.dtypes).groups) final_df.head(5)",Yes,4,41.0 "#Initializing training set X=final_df.iloc[:,final_df.columns!='count'].values Y=final_df.iloc[:,5].values",No,5,21.0 "#Ridge Regression Implementation 10 Folds # define model model = Ridge() # define model evaluation method cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1) # define grid grid = dict() grid['alpha'] = arange(0, 1, 0.01) # define search search = GridSearchCV(model, grid, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1) # perform the search results = search.fit(X, Y) # summarize print('MSE (NEGATIVE): %.3f' % results.best_score_) print('Config: %s' % results.best_params_)",Yes,5,6.0 "#Lasso Regression Implementation 10 Folds # define model model = Lasso() # define model 
evaluation method cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1) # define grid grid = dict() grid['alpha'] = arange(0, 1, 0.01) # define search search = GridSearchCV(model, grid, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1) # perform the search results = search.fit(X, Y) # summarize print('MSE (NEGATIVE): %.3f' % results.best_score_) print('Config: %s' % results.best_params_)",Yes,5,6.0 "#Decision TreeImplementation 10 Folds cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1) dtm = DecisionTreeRegressor(random_state=42) param_grid = {""criterion"": [""mse"", ""mae""], } search = GridSearchCV(dtm,param_grid, scoring='neg_mean_squared_error', cv=cv) results = search.fit(X, Y) # summarize print('MSE (NEGATIVE): %.3f' % results.best_score_) print('Config: %s' % results.best_params_)'",Yes,4,6.0 "#Decision Tree with Pruning with 10 Folds cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1) dtm = DecisionTreeRegressor(random_state=42) param_grid = {""criterion"": [""mse"", ""mae""], ""max_depth"": [2, 6, 8], } search = GridSearchCV(dtm,param_grid, scoring='neg_mean_squared_error', cv=cv) results = search.fit(X, Y) # summarize print('MSE (NEGATIVE): %.3f' % results.best_score_) print('Config: %s' % results.best_params_)'",Yes,5,6.0 "grid = dict() grid['n_estimators'] = [1000] grid['max_depth'] = [125,150,175] cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1) model = RandomForestRegressor() search = GridSearchCV(model,param_grid=grid, scoring='neg_mean_squared_error', cv=cv) results = search.fit(X, Y) # summarize print('MSE (NEGATIVE): %.3f' % results.best_score_) print('Config: %s' % results.best_params_)",Yes,4,6.0 "def grid_search(): from sklearn.ensemble import GradientBoostingRegressor print ('lets go') model = GradientBoostingRegressor() # define the grid of values to search grid = dict() grid['n_estimators'] = [4000] grid['learning_rate'] = [ 0.001, 0.01, 0.1] grid['max_depth'] = [4] cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1) # define the grid search procedure grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='neg_mean_squared_error') grid_search=grid_search.fit(X,Y) best_accuracy=grid_search.best_score_ best_parameters=grid_search.best_params_ print (best_accuracy) print (best_parameters) grid_search()",Yes,5,6.0 "from sklearn.ensemble import GradientBoostingRegressor rgr=GradientBoostingRegressor(learning_rate=0.1,n_estimators=4000, max_depth=4) rgr.fit(X,Y)",Yes,5,7.0 "test_df=pd.read_csv('/kaggle/input/bike-sharing-demand/test.csv') test_df['day']=test_df['datetime'].apply(lambda dateString : calendar.day_name[datetime.strptime(dateString,""%Y-%m-%d %H:%M:%S"").weekday()]) test_df['datetime']=pd.to_datetime(test_df['datetime'], format='%Y-%m-%d %H:%M:%S') test_df['month']=test_df['datetime'].apply(lambda x:x.month) test_df['hour']=test_df['datetime'].apply(lambda x:x.hour) test_df['year']=test_df['datetime'].apply(lambda x:x.year) test_df=test_df.drop(['datetime','temp'], axis=1) #adding dummy varibles to categorical variables weather_df=pd.get_dummies(test_df['weather'],prefix='w',drop_first=True) yr_df=pd.get_dummies(test_df['year'],prefix='y',drop_first=True) month_df=pd.get_dummies(test_df['month'],prefix='m',drop_first=True) hour_df=pd.get_dummies(test_df['hour'],prefix='h',drop_first=True) season_df=pd.get_dummies(test_df['season'],prefix='s',drop_first=True) day_df=pd.get_dummies(test_df['day'],prefix='d',drop_first=True) 
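# Note: building dummies separately for train and test can leave the two frames with
# different columns if some category (e.g. a rare weather level) never appears in the
# test window. A minimal defensive sketch - assuming final_df is the train-side frame
# built earlier, and that this would run only after the joins below:
#   train_cols = [c for c in final_df.columns if c != 'count']
#   test_df = test_df.reindex(columns=train_cols, fill_value=0)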
test_df=test_df.drop(['weather','year','month','hour','season','day'], axis=1) test_df=test_df.join(weather_df) test_df=test_df.join(yr_df) test_df=test_df.join(month_df) test_df=test_df.join(hour_df) test_df=test_df.join(season_df) test_df=test_df.join(day_df)'",Yes,3,16.0 "temp=pd.read_csv('../input/bike-sharing-demand/sampleSubmission.csv') X_test=test_df.iloc[:,:].values y_output=rgr.predict(X_test) y_output op=pd.DataFrame({'count':np.exp(y_output)}) op['datetime']=temp['datetime'] op.to_csv('finalSubmission.csv', index=False)",Yes,3,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load # import numpy as np # linear algebra # import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) # Input data files are available in the read-only ""../input/"" directory # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory import os for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename)) # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using ""Save & Run All"" # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session'",No,5,88.0 "import pandas as pd import numpy as np import seaborn as sns from scipy import stats from datetime import datetime import matplotlib.pyplot as plt %matplotlib inline",No,5,23.0 "train = pd.read_csv('../input/bike-sharing-demand/train.csv') test = pd.read_csv('../input/bike-sharing-demand/test.csv')",No,5,45.0 "tt = train.append(test) tt = tt.reset_index().drop('index', axis=1) tt.head()",Yes,3,11.0 sns.distplot(train['count']),No,5,33.0 "# Time processing # Add two columns, date and hour respectively temp = pd.DatetimeIndex(train['datetime']) train['year'] = temp.year train['date'] = temp.date train['hour'] = temp.hour # Categorical variables for the day of the week train['dayofweek'] = pd.DatetimeIndex(train.date).dayofweek",No,5,8.0 "# The impact of each time period of the day on count sns.boxplot(train['hour'], train['count'])",No,5,75.0 "# The influence of the total days of the week on count sns.boxplot(train['dayofweek'], train['count'])",No,5,75.0 "# Changes in count for each day of the week sns.pointplot(x='hour', y='count', hue='dayofweek', data=train)",No,5,75.0 "# The impact of different months on count train['month'] = pd.to_datetime(train['datetime']).dt.month sns.boxplot(train['month'], train['count'])",Yes,5,81.0 "# The impact of holidays on count sns.pointplot(x='hour', y='count',hue='workingday', data=train)",No,5,75.0 "# The impact of weather on count sns.pointplot(x='hour', y='count', hue='weather', data=train)",No,5,75.0 "# The influence of season on count sns.pointplot(x='hour', y='count', hue='season', data=train)",No,5,75.0 "# Pearson coefficient cor=train[['temp', 'atemp', 'casual', 'registered', 'humidity','windspeed', 'count']].corr() sns.heatmap(cor, square=True, annot=True)",No,5,80.0 "temp = pd.DatetimeIndex(tt['datetime']) tt['year'] = temp.year tt['hour'] = temp.hour tt = tt[['hour', 'year', 'workingday', 'holiday', 'season', 'weather', 'atemp', 'count']] # One-hot coding for discrete variables, such as color red, yellow, and blue coding as [[1,0,0], [0,1,0], [0,0,1]] tt = pd.get_dummies(tt, 
columns=['hour'], prefix=['hour'], drop_first=True) tt = pd.get_dummies(tt, columns=['year'], prefix=['year'], drop_first=True) tt = pd.get_dummies(tt, columns=['season'], prefix=['season'], drop_first=True) tt = pd.get_dummies(tt, columns=['weather'], prefix=['weather'], drop_first=True) tt.head()",Yes,3,20.0 "# Extract the training set and test set from the processed data set, [0:10886] and [10886:] new_train = tt.iloc[:10886, :] # Pair count+1, then take the logarithm y = np.log1p(new_train['count']) new_test = tt.iloc[10886:, :].drop('count',axis=1) new_train.drop('count', axis=1, inplace=True) x = new_train x.head()",Yes,5,10.0 "from sklearn.model_selection import train_test_split from sklearn.metrics import mean_squared_error from sklearn.model_selection import cross_val_score x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=3)",Yes,5,13.0 "from sklearn.linear_model import LinearRegression lmodel = LinearRegression() lmodel.fit(x, y) cross_val_score(lmodel, x, y, cv=5).mean()",Yes,4,7.0 "lmodel.fit(x_train, y_train) pre = lmodel.predict(x_test) mean_squared_error(y_test, pre)",Yes,3,7.0 "from sklearn.ensemble import RandomForestRegressor from sklearn.model_selection import GridSearchCV rfr = RandomForestRegressor(random_state=50, max_features='sqrt', oob_score=True)",Yes,5,4.0 "# Parameter tuning - This step requires a lot of calculation para = {'n_estimators': np.arange(200, 241, 1)} rf = GridSearchCV(estimator=rfr, param_grid=para, cv=5) rf.fit(x, y)",Yes,5,6.0 rf.best_params_,No,5,2.0 "rfr = RandomForestRegressor(n_estimators=227, random_state=50, max_features='sqrt',oob_score=True) cross_val_score(rfr, x, y, cv=5).mean()",No,4,28.0 "rfr.fit(x_train, y_train) pre = rfr.predict(x_test) mean_squared_error(y_test, pre)",Yes,3,7.0 "rfr.fit(x,y)",No,5,7.0 "co = rfr.predict(new_test) m = [] # Decrease the result by one and round up for i in (np.exp(co) - 1): n = round(i) m.append(n) predict = pd.DataFrame({'datetime': test['datetime'], 'count': m}) predict.to_csv('rfr.csv', index=False) ",Yes,4,25.0 "import numpy as np import pandas as pd import matplotlib.pyplot as plt from pandas.plotting import register_matplotlib_converters register_matplotlib_converters()",Yes,5,23.0 "# Example # Converts to log1p(count) # Print original count back using expm1 print('Test log and exp') test_count = 100 print('original value', test_count) x = np.log1p(test_count) # log (x+1) print('log1p', x) print('expm1', np.expm1(x)) # exp(x) - 1",No,3,8.0 "df = pd.read_csv('../input/bike-sharing-demand/train.csv',parse_dates=['datetime'],index_col=0) df_test = pd.read_csv('../input/bike-sharing-demand/test.csv',parse_dates=['datetime'],index_col=0) ",No,5,45.0 "# We need to convert datetime to numeric for training. # Let's extract key features into separate numeric columns def add_features(df): df['year'] = df.index.year df['month'] = df.index.month df['day'] = df.index.day df['dayofweek'] = df.index.dayofweek df['hour'] = df.index.hour",No,5,8.0 "# Need to predict the missing data plt.title('Rental Count - Gaps') df['2011-01':'2011-02']['count'].plot() plt.show()",No,5,81.0 "# Rentals change hourly! 
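# (Date-string selections such as df['2011-01-01'] or df['2011-01'] below rely on the
#  DatetimeIndex set when the CSV was read: pandas' partial-string indexing picks every
#  row whose timestamp falls inside that day or month, roughly equivalent to
#  df[(df.index >= '2011-01-01') & (df.index < '2011-01-02')] for a single day.)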
plt.plot(df['2011-01-01']['count']) plt.xticks(fontsize=14, rotation=45) plt.xlabel('Date') plt.ylabel('Rental Count') plt.title('Hourly Rentals for Jan 01, 2011') plt.show()",No,5,81.0 "# Seasonal plt.plot(df['2011-01']['count']) plt.xticks(fontsize=14, rotation=45) plt.xlabel('Date') plt.ylabel('Rental Count') plt.title('Jan 2011 Rentals (1 month)') plt.show()",No,5,75.0 "group_hour = df.groupby(['hour']) average_by_hour = group_hour['count'].mean()",Yes,5,60.0 "plt.plot(average_by_hour.index,average_by_hour) plt.xlabel('Hour') plt.ylabel('Rental Count') plt.xticks(np.arange(24)) plt.grid(True) plt.title('Average Hourly Rental Count')",No,5,75.0 "# Year to year trend plt.plot(df['2011']['count'],label='2011') plt.plot(df['2012']['count'],label='2012') plt.xticks(fontsize=14, rotation=45) plt.xlabel('Date') plt.ylabel('Rental Count') plt.title('2011 and 2012 Rentals (Year to Year)') plt.legend() plt.show()",No,5,75.0 "plt.plot(df['2011']['count'].map(np.log1p),label='2011') plt.plot(df['2012']['count'].map(np.log1p),label='2012') plt.xticks(fontsize=14, rotation=45) plt.xlabel('Date') plt.ylabel('Log(Rental Count)') plt.title('2011 and 2012 Rentals (Year to Year)') plt.legend() plt.show()",No,5,75.0 "plt.boxplot([df['count']], labels=['count']) plt.title('Box Plot - Count') plt.ylabel('Target') plt.grid(True)",No,3,33.0 "# Let's see how the data distribution changes with log1p # Evenly distributed plt.boxplot([df['count'].map(np.log1p)], labels=['log1p(count)']) plt.title('Box Plot - log1p(Count)') plt.ylabel('Target') plt.grid(True)",No,3,33.0 "df[""count""] = df[""count""].map(np.log1p)",No,5,8.0 "group_year_month = df.groupby(['year','month'])",No,5,60.0 average_year_month = group_year_month['count'].mean(),No,2,8.0 average_year_month,No,5,41.0 "for year in average_year_month.index.levels[0]: plt.plot(average_year_month[year].index,average_year_month[year],label=year) plt.legend() plt.xlabel('Month') plt.ylabel('Count') plt.grid(True) plt.title('Average Monthly Rental Count for 2011, 2012') plt.show()",No,4,75.0 "group_year_hour = df.groupby(['year','hour']) average_year_hour = group_year_hour['count'].mean() for year in average_year_hour.index.levels[0]: #print (year) #print(average_year_month[year]) plt.plot(average_year_hour[year].index,average_year_hour[year],label=year) plt.legend() plt.xlabel('Hour') plt.ylabel('Count') plt.xticks(np.arange(24)) plt.grid(True) plt.title('Average Hourly Rental Count - 2011, 2012')",Yes,4,75.0 "group_workingday_hour = df.groupby(['workingday','hour']) average_workingday_hour = group_workingday_hour['count'].mean()",Yes,5,60.0 "for workingday in average_workingday_hour.index.levels[0]: #print (year) #print(average_year_month[year]) plt.plot(average_workingday_hour[workingday].index,average_workingday_hour[workingday], label=workingday) plt.legend() plt.xlabel('Hour') plt.ylabel('Count') plt.xticks(np.arange(24)) plt.grid(True) plt.title('Average Hourly Rental Count by Working Day') plt.show()",No,4,75.0 "# Let's look at correlation beween features and target df.corr()['count']",No,5,40.0 "# Any relation between temperature and rental count? plt.scatter(x=df.temp,y=df[""count""]) plt.grid(True) plt.xlabel('Temperature') plt.ylabel('Count') plt.title('Temperature vs Count') plt.show()'",No,5,33.0 "# Any relation between humidity and rental count? 
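# (Reminder: 'count' was replaced by log1p(count) a few cells above, so this scatter,
#  the temperature scatter and the correlation table are all against the log-scale
#  target rather than raw rental counts.)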
plt.scatter(x=df.humidity,y=df[""count""],label='Humidity') plt.grid(True) plt.xlabel('Humidity') plt.ylabel('Count') plt.title('Humidity vs Count') plt.show()'",No,5,33.0 "# Save all data df.to_csv('bike_all.csv',index=True,index_label='datetime',columns=columns)",No,5,25.0 "# Training = 70% of the data # Validation = 30% of the data # Randomize the datset np.random.seed(5) l = list(df.index) np.random.shuffle(l) df = df.loc[l]",No,5,15.0 "rows = df.shape[0] train = int(.7 * rows) test = rows-train",No,4,13.0 "rows, train, test",No,5,41.0 columns,No,5,71.0 "# Write Training Set df.iloc[:train].to_csv('bike_train.csv' ,index=False,header=False ,columns=columns)",No,5,25.0 "# Write Validation Set df.iloc[train:].to_csv('bike_validation.csv' ,index=False,header=False ,columns=columns)",No,5,25.0 "# Test Data has only input features df_test.to_csv('bike_test.csv',index=True,index_label='datetime')",No,5,25.0 "print(','.join(columns))",No,3,71.0 "# Write Column List with open('bike_train_column_list.txt','w') as f: f.write(','.join(columns))",No,5,84.0 "# Install xgboost in notebook instance. #### Command to install xgboost !pip install xgboost==0.90",No,5,87.0 "import sys import numpy as np import pandas as pd import matplotlib.pyplot as plt from sklearn.metrics import mean_squared_error, mean_absolute_error # XGBoost import xgboost as xgb",No,5,22.0 "column_list_file = 'bike_train_column_list.txt' train_file = 'bike_train.csv' validation_file = 'bike_validation.csv' test_file = 'bike_test.csv'",No,5,77.0 "columns = '' with open(column_list_file,'r') as f: columns = f.read().split(',')",No,5,88.0 "# Specify the column names as the file does not have column header df_train = pd.read_csv(train_file,names=columns) df_validation = pd.read_csv(validation_file,names=columns)",No,5,45.0 "X_train = df_train.iloc[:,1:] # Features: 1st column onwards y_train = df_train.iloc[:,0].ravel() # Target: 0th column X_validation = df_validation.iloc[:,1:] y_validation = df_validation.iloc[:,0].ravel()",No,5,21.0 "# XGBoost Training Parameter Reference: # https://github.com/dmlc/xgboost/blob/master/doc/parameter.md #regressor = xgb.XGBRegressor(max_depth=5,eta=0.1,subsample=0.7,num_round=150) regressor = xgb.XGBRegressor(max_depth=5,n_estimators=150)",No,5,4.0 regressor,No,2,4.0 "regressor.fit(X_train,y_train, eval_set = [(X_train, y_train), (X_validation, y_validation)])",No,5,7.0 eval_result = regressor.evals_result(),No,5,28.0 training_rounds = range(len(eval_result['validation_0']['rmse'])),No,3,77.0 "plt.scatter(x=training_rounds,y=eval_result['validation_0']['rmse'],label='Training Error') plt.scatter(x=training_rounds,y=eval_result['validation_1']['rmse'],label='Validation Error') plt.grid(True) plt.xlabel('Iteration') plt.ylabel('RMSE') plt.title('Training Vs Validation Error') plt.legend() plt.show()",No,4,35.0 "xgb.plot_importance(regressor) plt.show()",No,5,79.0 "# Verify Quality using Validation dataset # Compare actual vs predicted performance with dataset not seen by the model before df = pd.read_csv(validation_file,names=columns)",No,5,45.0 "X_test = df.iloc[:,1:] print(X_test[:5])",Yes,4,13.0 result = regressor.predict(X_test),No,5,48.0 result[:5],No,4,41.0 df['count_predicted'] = result,No,5,8.0 "# Negative Values are predicted df['count_predicted'].describe()",No,5,40.0 df[df['count_predicted'] < 0],No,5,14.0 "df['count_predicted'].hist() plt.title('Predicted Count Histogram') plt.show()",No,5,33.0 "def adjust_count(x): if x < 0: return 0 else: return x",No,3,8.0 df['count_predicted'] = 
df['count_predicted'].map(adjust_count),No,3,8.0 "df['count'] = df['count'].map(np.expm1) df['count_predicted'] = df['count_predicted'].map(np.expm1)",No,5,8.0 "# Actual Vs Predicted plt.plot(df['count'], label='Actual') plt.plot(df['count_predicted'],label='Predicted') plt.xlabel('Sample') plt.ylabel('Count') plt.xlim([100,150]) plt.title('Validation Dataset - Predicted Vs. Actual') plt.legend() plt.show()",No,4,56.0 "# Over prediction and Under Prediction needs to be balanced # Training Data Residuals residuals = (df['count'] - df['count_predicted']) plt.hist(residuals) plt.grid(True) plt.xlabel('Actual - Predicted') plt.ylabel('Count') plt.title('Residuals Distribution') plt.axvline(color='r') plt.show()",Yes,5,33.0 "print(""RMSE: {0:0.2f}"".format(mean_squared_error(df['count'],df['count_predicted'])**.5))'",No,5,49.0 "# RMSlE - Root Mean Squared Log Error # RMSLE Metric is used by Kaggle for this competition # RMSE Cost Function - Magnitude of difference matters # RMSLE cost function - ""Only Percentage difference matters"" # Reference:Katerina Malahova, Khor SoonHin # https://www.slideshare.net/KhorSoonHin/rmsle-cost-function def compute_rmsle(y_true, y_pred): if type(y_true) != np.ndarray: y_true = np.array(y_true) if type(y_pred) != np.ndarray: y_pred = np.array(y_pred) return(np.average((np.log1p(y_pred) - np.log1p(y_true))**2)**.5)",No,5,84.0 "print('RMSLE') print(compute_rmsle(100,50), compute_rmsle(1000,500), compute_rmsle(10000,5000))",No,2,49.0 "print('RMSLE') print(compute_rmsle(100,25), compute_rmsle(1000,250), compute_rmsle(10000,2500))",No,2,49.0 "print('RMSE') print(mean_squared_error([100],[50])**.5, mean_squared_error([1000],[500])**.5, mean_squared_error([10000],[5000])**.5)",No,2,49.0 "print('RMSE') print(mean_squared_error([100],[25])**.5, mean_squared_error([1000],[250])**.5, mean_squared_error([10000],[2500])**.5)",No,2,49.0 "print(""RMSLE: {0}"".format(compute_rmsle(df['count'],df['count_predicted'])))'",No,5,49.0 "# Prepare Data for Submission to Kaggle df_test = pd.read_csv(test_file,parse_dates=['datetime'])",No,5,45.0 "X_test = df_test.iloc[:,1:] # Exclude datetime for prediction",No,4,13.0 np.expm1(result),No,4,8.0 "# Convert result to actual count df_test[""count""] = np.expm1(result)",No,5,8.0 "df_test[df_test[""count""] < 0]",No,5,14.0 "df_test[['datetime','count']].to_csv('predicted_count_log.csv',index=False)",No,5,25.0 "submission.iloc[1258:1269, 1]= submission.iloc[1258:1269, 1]*0.5 submission.iloc[4492:4515, 1]= submission.iloc[4492:4515, 1]*0.5 # submission.iloc[6308:6330, 1]= submission.iloc[6308:6330, 1]*0.5 submission.iloc[3041:3063, 1]= submission.iloc[3041:3063, 1]*0.5 # submission.iloc[6332:6354, 1]= submission.iloc[6332:6354, 1]*0.5 submission.iloc[3065:3087, 1]= submission.iloc[3065:3087, 1]*0.5 # submission.iloc[5992:6015, 1]= submission.iloc[5992:6015, 1]*0.5 submission.iloc[2771:2794, 1]= submission.iloc[2771:2794, 1]*0.5'",No,5,14.0 "submission.drop(""holiday"",1,inplace=True) submission.to_csv(""allrf2.csv"", index=False)",No,5,25.0 "import matplotlib.pyplot as plt import seaborn as sns %matplotlib inline",No,5,23.0 "train_df = pd.read_csv('../input/bike-sharing-demand/train.csv') test_df = pd.read_csv('../input/bike-sharing-demand/test.csv')",No,5,45.0 "print(train_df.shape) print(test_df.shape)",No,5,58.0 "train_df['datetime'] = pd.to_datetime(train_df['datetime']) test_df['datetime'] = pd.to_datetime(test_df['datetime'])",No,5,16.0 "train_df['year'] = train_df['datetime'].apply(lambda x: x.year) train_df['month'] = 
train_df['datetime'].apply(lambda x: x.month) train_df['day'] = train_df['datetime'].apply(lambda x: x.day) train_df['hour'] = train_df['datetime'].apply(lambda x: x.hour) test_df['year'] = test_df['datetime'].apply(lambda x: x.year) test_df['month'] = test_df['datetime'].apply(lambda x: x.month) test_df['day'] = test_df['datetime'].apply(lambda x: x.day) test_df['hour'] = test_df['datetime'].apply(lambda x: x.hour)",No,5,8.0 "train_df = train_df.drop(['datetime', 'casual', 'registered'], axis=1) test_df = test_df.drop(['datetime'], axis=1)",No,5,10.0 "def rmsle(y, pred): log_y = np.log1p(y) log_pred = np.log1p(pred) squared_error = (log_y - log_pred)**2 rmsle = np.sqrt(np.mean(squared_error)) return rmsle",No,5,84.0 sns.distplot(train_df['count']),No,5,33.0 sns.distplot(np.log1p(train_df['count'])),No,5,33.0 train_df['count'] = np.log1p(train_df['count']),No,5,8.0 "from sklearn.model_selection import train_test_split, GridSearchCV from sklearn.linear_model import LinearRegression, Ridge, Lasso X_train, X_test, y_train, y_test = train_test_split(train_df.drop(['count'], axis=1), train_df['count'], test_size=0.3)",Yes,4,13.0 "lr_reg = LinearRegression() lr_reg.fit(X_train, y_train) pred = lr_reg.predict(X_test) y_test_exp = np.expm1(y_test) pred_exp = np.expm1(pred) print('RMSLE:', rmsle(y_test_exp, pred_exp))",Yes,4,28.0 "coef = pd.Series(lr_reg.coef_, index=X_train.columns) coef_sort = coef.sort_values(ascending=False) sns.barplot(x=coef_sort.values, y=coef_sort.index)",No,5,79.0 "train_df = pd.get_dummies(train_df, columns=['year', 'month', 'day', 'hour', 'holiday', 'workingday', 'season', 'weather']) test_df = pd.get_dummies(test_df, columns=['year', 'month', 'day', 'hour', 'holiday', 'workingday', 'season', 'weather'])",No,5,20.0 "train_df, test_df = train_df.align(test_df, join='left', axis=1) test_df = test_df.drop(['count'], axis=1)",Yes,4,10.0 "X_train, X_test, y_train, y_test = train_test_split(train_df.drop(['count'], axis=1), train_df['count'], test_size=0.3)",No,5,13.0 "lr_reg = LinearRegression() lr_reg.fit(X_train, y_train) pred = lr_reg.predict(X_test) y_test_exp = np.expm1(y_test) pred_exp = np.expm1(pred) print('LinearRegression RMSLE:', rmsle(y_test_exp, pred_exp))",No,3,28.0 "ridge_reg = Ridge(alpha=10) ridge_reg.fit(X_train, y_train) pred = ridge_reg.predict(X_test) y_test_exp = np.expm1(y_test) pred_exp = np.expm1(pred) print('Ridge RMSLE:', rmsle(y_test_exp, pred_exp))",No,4,4.0 "lasso_reg = Lasso(alpha=0.01) lasso_reg.fit(X_train, y_train) pred = lasso_reg.predict(X_test) y_test_exp = np.expm1(y_test) pred_exp = np.expm1(pred) print('Lasso RMSLE:', rmsle(y_test_exp, pred_exp))",No,3,28.0 "coef = pd.Series(lr_reg.coef_, index=X_train.columns) coef_sort = coef.sort_values(ascending=False)[:25] sns.barplot(x=coef_sort.values, y=coef_sort.index)",No,5,79.0 "from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor from xgboost import XGBRegressor from lightgbm import LGBMRegressor",No,5,22.0 "rf_reg = RandomForestRegressor(n_estimators=500) rf_reg.fit(X_train, y_train) pred = rf_reg.predict(X_test) y_test_exp = np.expm1(y_test) pred_exp = np.expm1(pred) print('RandomForestRegressor RMSLE:', rmsle(y_test_exp, pred_exp))",Yes,3,49.0 "gbm_reg = GradientBoostingRegressor(n_estimators=500) gbm_reg.fit(X_train, y_train) pred = gbm_reg.predict(X_test) y_test_exp = np.expm1(y_test) pred_exp = np.expm1(pred) print('GradientBoostingRegressor RMSLE:', rmsle(y_test_exp, pred_exp))",Yes,3,49.0 "xgb_reg = XGBRegressor(n_estimators=500) xgb_reg.fit(X_train, y_train) 
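# The target was log1p-transformed earlier, so the predictions below are mapped back
# with np.expm1 before scoring. Up to floating-point error, RMSLE on the original
# scale equals plain RMSE on the log1p scale, because log1p(expm1(x)) == x:
#   np.sqrt(np.mean((np.log1p(pred_exp) - np.log1p(y_test_exp)) ** 2))
#   == np.sqrt(np.mean((pred - y_test) ** 2))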
pred = xgb_reg.predict(X_test) y_test_exp = np.expm1(y_test) pred_exp = np.expm1(pred) print('XGBRegressor RMSLE:', rmsle(y_test_exp, pred_exp))",Yes,3,49.0 "lgbm_reg = LGBMRegressor(n_estimators=500) lgbm_reg.fit(X_train, y_train) pred = lgbm_reg.predict(X_test) y_test_exp = np.expm1(y_test) pred_exp = np.expm1(pred) print('LGBMRegressor RMSLE:', rmsle(y_test_exp, pred_exp))",No,4,7.0 "X_train = train_df.drop(['count'], axis=1) y_train = train_df['count'] X_test = test_df",No,5,21.0 "print(X_train.shape) print(y_train.shape) print(X_test.shape)",No,5,58.0 "lgbm_reg = LGBMRegressor(n_estimators=500) lgbm_reg.fit(X_train, y_train) pred = lgbm_reg.predict(X_test) pred_exp = np.expm1(pred)",Yes,4,48.0 "submission = pd.read_csv('../input/bike-sharing-demand/sampleSubmission.csv') submission",No,5,45.0 "submission.loc[:, 'count'] = pred_exp submission",No,5,55.0 "b""import numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport os\n\ndef nl():\n print('\\n')\n\nfor f in os.listdir('../input'):\n print(f.ljust(30) + str(round(os.path.getsize('../input/' + f) / 1000000, 2)) + 'MB')""",No,5,88.0 "target = df_train['Demanda_uni_equil'].tolist() def label_plot(title, x, y): plt.title(title) plt.xlabel(x) plt.ylabel(y) plt.hist(target, bins=200, color='blue') label_plot('Distribution of target values', 'Demanda_uni_equil', 'Count') plt.show() print(""Looks like we have some pretty big outliers, let's zoom in and try again"") print('Data with target values under 50: ' + str(round(len(df_train.loc[df_train['Demanda_uni_equil'] <= 50]) / 5000, 2)) + '%') plt.hist(target, bins=50, color='blue', range=(0, 50)) label_plot('Distribution of target values under 50', 'Demanda_uni_equil', 'Count') plt.show() '",Yes,4,45.0 "import numpy as np import pandas as pd import math colum_target=pd.read_csv(""../input/train.csv"",usecols=['Demanda_uni_equil']) m=colum_target['Demanda_uni_equil'].tolist() #print(m) #m=np.mean(np.linalg.logm(colum_target['Demanda_uni_equil'].value+1)) #m=np.exp(math.log(+1).mean()) #mm=colum_target['m'].mean() #print(mm) #result_mean=pd.read_csv(""../input/test.csv"",usecols=['id']) #result_mean['Demanda_uni_equil']=exp(mean) #result_mean.to_csv('result_mean.csv',index=False)'",Yes,4,22.0 "x=np.exp(np.mean(np.log(np.array(m)+1)))-1 print (x) result_logmean=pd.read_csv(""../input/test.csv"",usecols=['id']) result_logmean['Demanda_uni_equil']=x result_logmean.to_csv('result_logmean.csv',index=False)'",Yes,4,25.0 "import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv) train_data=pd.read_csv(""../input/train.csv"",usecols=['Producto_ID','Demanda_uni_equil']) '",Yes,5,45.0 "train_data['log_Dem']=np.log(np.array(train_data['Demanda_uni_equil'].tolist())+1) #print(train_data)",No,5,8.0 "test_data['Demanda_uni_equil']=np.exp(log_target)-1 print(test_data) test_data.to_csv('result_groupmean_log.csv',index=False,columns=['id','Demanda_uni_equil'])",Yes,5,25.0 test_data[test_data['Producto_ID']==41]['id'],No,3,41.0 "#mean_data.index #mean_data.ix[41] test_data.shape",No,5,58.0 "import numpy as np import pandas as pd from subprocess import check_output #types={'Semana':np.uint8,'Agencia_ID':np.uint16,'Canal_ID':np.uint8, # 'Ruta_SAK':np.uint16,'Cliente_ID':np.uint32,'Producto_ID':np.uint16, # 'Demanda_uni_equil':np.uint32} types = {'Semana':np.uint8, 'Agencia_ID':np.uint16, 'Canal_ID':np.uint8, 'Ruta_SAK':np.uint16, 'Cliente_ID':np.uint32, 'Producto_ID':np.uint16, 'Demanda_uni_equil':np.uint32} #train=pd.read_csv('../input/train.csv',usecols=types.keys(),dtype=types) train=pd.read_csv('../input/train.csv',usecols=types.keys(),dtype=types,nrows=1000) print(train.dtype) print(train.info(memery_usage=True)) ",No,3,45.0 "import numpy as np import pandas as pd from subprocess import check_output #types={'Semana':np.uint8,'Agencia_ID':np.uint16,'Canal_ID':np.uint8, # 'Ruta_SAK':np.uint16,'Cliente_ID':np.uint32,'Producto_ID':np.uint16, # 'Demanda_uni_equil':np.uint32} types = {'Semana':np.uint8, 'Agencia_ID':np.uint16, 'Canal_ID':np.uint8, 'Ruta_SAK':np.uint16, 'Cliente_ID':np.uint32, 'Producto_ID':np.uint16, 'Demanda_uni_equil':np.uint32} #train=pd.read_csv('../input/train.csv',usecols=types.keys(),dtypes=types) train=pd.read_csv('../input/train.csv',usecols=types.keys(),dtypes=types,nrows=1000) print(train.dtype) print(train.info(memery_usage=True))",Yes,3,45.0 "from subprocess import check_output import pandas as pd print(check_output(['ls','.']).decode('utf8')) submission=pd.read_csv('../input/sample_submission.csv') print(submission.shape) print(submission.columns) print(submission.head(20))",Yes,3,45.0 "import numpy as np import pandas as pd import gc import xgboost as xgb import math from sklearn.cross_validation import train_test_split from ml_metrics import rmsle def evalerror(preds, dtrain): labels = dtrain.get_label() assert len(preds) == len(labels) labels = labels.tolist() preds = preds.tolist() terms_to_sum = [(math.log(labels[i] + 1) - math.log(max(0,preds[i]) + 1)) ** 2.0 for i,pred in enumerate(labels)] return 'error', (sum(terms_to_sum) * (1.0/len(preds))) ** 0.5 nrows=10000 train=pd.read_csv('../input/train.csv',nrows=nrows) test=pd.read_csv('../input/test.csv',nrows=nrows) print(train.columns) print(test.columns) print(test.columns.values) ids=test['id'] test=test.drop(['id'],axis=1) y=train['Demanda_uni_equil'] X=train[test.columns.values] #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1729) X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=1000) params = {} params['objective'] = ""reg:linear"" params['eta'] = 0.05 params['max_depth'] = 5 params['subsample'] = 0.8 params['colsample_bytree'] = 0.6 params['silent'] = True print ('') test_preds = np.zeros(test.shape[0]) xg_train = xgb.DMatrix(X_train, label=y_train) xg_test = xgb.DMatrix(X_test) watchlist = [(xg_train, 'train')] num_rounds = 100 xgclassifier = xgb.train(params, xg_train, num_rounds, watchlist, feval = evalerror, early_stopping_rounds= 20, verbose_eval = 10) preds = xgclassifier.predict(xg_test, 
ntree_limit=xgclassifier.best_iteration) print ('RMSLE Score:', rmsle(y_test, preds)) fxg_test = xgb.DMatrix(test) fold_preds = np.around(xgclassifier.predict(fxg_test, ntree_limit=xgclassifier.best_iteration), decimals = 1) test_preds += fold_preds submission = pd.DataFrame({'id':ids, 'Demanda_uni_equil': test_preds}) submission.to_csv('submission.csv', index=False)'",Yes,3,45.0 test['count'] = (np.exp(rf.predict(test[feats])) - 1),No,5,8.0 "test[['datetime', 'count']].to_csv('submission.csv', index=False)",No,5,25.0 "__author__ = 'ZFTurbo: https://kaggle.com/zfturbo' import datetime import pandas as pd import numpy as np from sklearn.cross_validation import train_test_split import xgboost as xgb import random import zipfile import time import shutil from sklearn.metrics import log_loss random.seed(2016) def run_xgb(train, test, features, target, random_state=0): eta = 0.3 max_depth = 6 subsample = 1 colsample_bytree = 0.7 start_time = time.time() print('XGBoost params. ETA: {}, MAX_DEPTH: {}, SUBSAMPLE: {}, COLSAMPLE_BY_TREE: {}'.format(eta, max_depth, subsample, colsample_bytree)) params = { ""objective"": ""multi:softprob"", ""num_class"": 12, ""booster"" : ""gbtree"", ""eval_metric"": ""mlogloss"", ""eta"": eta, ""max_depth"": max_depth, ""subsample"": subsample, ""colsample_bytree"": colsample_bytree, ""silent"": 1, ""seed"": random_state, } num_boost_round = 500 early_stopping_rounds = 50 test_size = 0.3 X_train, X_valid = train_test_split(train, test_size=test_size, random_state=random_state) print('Length train:', len(X_train.index)) print('Length valid:', len(X_valid.index)) y_train = X_train[target] y_valid = X_valid[target] dtrain = xgb.DMatrix(X_train[features], y_train) dvalid = xgb.DMatrix(X_valid[features], y_valid) watchlist = [(dtrain, 'train'), (dvalid, 'eval')] gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, early_stopping_rounds=early_stopping_rounds, verbose_eval=True) print(""Validating..."") check = gbm.predict(xgb.DMatrix(X_valid[features]), ntree_limit=gbm.best_iteration) score = log_loss(y_valid.tolist(), check) print(""Predict test set..."") test_prediction = gbm.predict(xgb.DMatrix(test[features]), ntree_limit=gbm.best_iteration) print('Training time: {} minutes'.format(round((time.time() - start_time)/60, 2))) return test_prediction.tolist(), score def create_submission(score, test, prediction): # Make Submission now = datetime.datetime.now() sub_file = 'submission_' + str(score) + '_' + str(now.strftime(""%Y-%m-%d-%H-%M"")) + '.csv' print('Writing submission: ', sub_file) f = open(sub_file, 'w') f.write('device_id,F23-,F24-26,F27-28,F29-32,F33-42,F43+,M22-,M23-26,M27-28,M29-31,M32-38,M39+\ ') total = 0 test_val = test['device_id'].values for i in range(len(test_val)): str1 = str(test_val[i]) for j in range(12): str1 += ',' + str(prediction[i][j]) str1 += '\ ' total += 1 f.write(str1) f.close() def map_column(table, f): labels = sorted(table[f].unique()) mappings = dict() for i in range(len(labels)): mappings[labels[i]] = i table = table.replace({f: mappings}) return table def read_train_test(): # Events print('Read events...') events = pd.read_csv(""../input/events.csv"", dtype={'device_id': np.str}) events['counts'] = events.groupby(['device_id'])['event_id'].transform('count') events_small = events[['device_id', 'counts']].drop_duplicates('device_id', keep='first') # Phone brand print('Read brands...') pbd = pd.read_csv(""../input/phone_brand_device_model.csv"", dtype={'device_id': np.str}) pbd.drop_duplicates('device_id', keep='first', 
inplace=True) pbd = map_column(pbd, 'phone_brand') pbd = map_column(pbd, 'device_model') # Train print('Read train...') train = pd.read_csv(""../input/gender_age_train.csv"", dtype={'device_id': np.str}) train = map_column(train, 'group') train = train.drop(['age'], axis=1) train = train.drop(['gender'], axis=1) train = pd.merge(train, pbd, how='left', on='device_id', left_index=True) train = pd.merge(train, events_small, how='left', on='device_id', left_index=True) train.fillna(-1, inplace=True) # Test print('Read test...') test = pd.read_csv(""../input/gender_age_test.csv"", dtype={'device_id': np.str}) test = pd.merge(test, pbd, how='left', on='device_id', left_index=True) test = pd.merge(test, events_small, how='left', on='device_id', left_index=True) test.fillna(-1, inplace=True) # Features features = list(test.columns.values) features.remove('device_id') return train, test, features train, test, features = read_train_test() print('Length of train: ', len(train)) print('Length of test: ', len(test)) print('Features [{}]: {}'.format(len(features), sorted(features))) test_prediction, score = run_xgb(train, test, features, 'group') print(""LS: {}"".format(round(score, 5))) create_submission(score, test, test_prediction) '",No,2,22.0 "import random import datetime import numpy as np import pandas as pd import xgboost as xgb from scipy.sparse import csr_matrix, hstack from sklearn.preprocessing import LabelEncoder from sklearn.cross_validation import train_test_split from subprocess import check_output random.seed(2010) print(check_output(['ls', '../input']).decode('utf8'))",No,4,88.0 "def read(file_name): return pd.read_csv('../input/{}.csv'.format(file_name)) app_events = read('app_events') app_labels = read('app_labels') events = read('events') gender_age_test = read('gender_age_test') gender_age_train = read('gender_age_train') label_categories = read('label_categories') phone_brand_device_model = read('phone_brand_device_model') sample_submission = read('sample_submission') app_le = LabelEncoder() app_le.fit(app_events['app_id']) device_model_le = LabelEncoder() device_model_le.fit(phone_brand_device_model['device_model'])",Yes,4,45.0 "import pandas as pd import numpy as np %matplotlib inline import seaborn as sns import matplotlib.pyplot as plt import os from sklearn.preprocessing import LabelEncoder from scipy.sparse import csr_matrix, hstack from sklearn.linear_model import LogisticRegression from sklearn.cross_validation import StratifiedKFold from sklearn.metrics import log_loss",No,5,23.0 "datadir = '../input' gatrain = pd.read_csv(os.path.join(datadir,'gender_age_train.csv'), index_col='device_id') gatest = pd.read_csv(os.path.join(datadir,'gender_age_test.csv'), index_col = 'device_id') phone = pd.read_csv(os.path.join(datadir,'phone_brand_device_model.csv')) # Get rid of duplicate device ids in phone phone = phone.drop_duplicates('device_id',keep='first').set_index('device_id') events = pd.read_csv(os.path.join(datadir,'events.csv'), parse_dates=['timestamp'], index_col='event_id') appevents = pd.read_csv(os.path.join(datadir,'app_events.csv'), usecols=['event_id','app_id','is_active'], dtype={'is_active':bool}) applabels = pd.read_csv(os.path.join(datadir,'app_labels.csv'))",No,4,45.0 "gatrain['trainrow'] = np.arange(gatrain.shape[0]) gatest['testrow'] = np.arange(gatest.shape[0])",No,5,8.0 "m = phone.phone_brand.str.cat(phone.device_model) modelencoder = LabelEncoder().fit(m) phone['model'] = modelencoder.transform(m) gatrain['model'] = phone['model'] gatest['model'] = 
phone['model'] Xtr_model = csr_matrix((np.ones(gatrain.shape[0]), (gatrain.trainrow, gatrain.model))) Xte_model = csr_matrix((np.ones(gatest.shape[0]), (gatest.testrow, gatest.model))) print('Model features: train shape {}, test shape {}'.format(Xtr_model.shape, Xte_model.shape))",Yes,4,20.0 "applabels = applabels.loc[applabels.app_id.isin(appevents.app_id.unique())] applabels['app'] = appencoder.transform(applabels.app_id) labelencoder = LabelEncoder().fit(applabels.label_id) applabels['label'] = labelencoder.transform(applabels.label_id) nlabels = len(labelencoder.classes_)",No,4,20.0 "devicelabels = (deviceapps[['device_id','app']] .merge(applabels[['app','label']]) .groupby(['device_id','label'])['app'].agg(['size']) .merge(gatrain[['trainrow']], how='left', left_index=True, right_index=True) .merge(gatest[['testrow']], how='left', left_index=True, right_index=True) .reset_index()) devicelabels.head()",No,3,32.0 "d = devicelabels.dropna(subset=['trainrow']) Xtr_label = csr_matrix((np.ones(d.shape[0]), (d.trainrow, d.label)), shape=(gatrain.shape[0],nlabels)) d = devicelabels.dropna(subset=['testrow']) Xte_label = csr_matrix((np.ones(d.shape[0]), (d.testrow, d.label)), shape=(gatest.shape[0],nlabels)) print('Labels data: train shape {}, test shape {}'.format(Xtr_label.shape, Xte_label.shape))",Yes,4,58.0 "Xtrain = hstack((Xtr_brand, Xtr_model, Xtr_app, Xtr_label), format='csr') Xtest = hstack((Xte_brand, Xte_model, Xte_app, Xte_label), format='csr') print('All features: train shape {}, test shape {}'.format(Xtrain.shape, Xtest.shape))",No,4,12.0 Xtrain,No,5,41.0 "targetencoder = LabelEncoder().fit(gatrain.group) y = targetencoder.transform(gatrain.group) nclasses = len(targetencoder.classes_)",No,5,20.0 "dtrain = xgb.DMatrix(Xtrain, y)",No,2,7.0 "params = { ""eta"": 0.1, ""booster"": ""gblinear"", ""objective"": ""multi:softprob"", ""alpha"": 4, ""lambda"": 0, ""silent"": 1, ""seed"": 1233, ""num_class"": 12, ""eval_metric"": ""mlogloss"" }",No,5,59.0 "xgb.cv(params, dtrain, num_boost_round=50, #early_stopping_rounds = 5, maximize = False)",No,5,28.0 "from sklearn.ensemble import RandomForestClassifier model = RandomForestClassifier() #model.fit(Xtrain, y) hmm this is going to take too long.",No,5,4.0 "model = xgb.train(params, dtrain, num_boost_round=25)",No,5,7.0 "model.predict(dtest) pred = pd.DataFrame(model.predict(dtest), index = gatest.index, columns=targetencoder.classes_)",No,5,48.0 pred.head(),No,5,41.0 "pred.to_csv('xgb_subm.csv',index=True) ",No,5,25.0 "brandencoder = LabelEncoder().fit(phone.phone_brand) phone['brand'] = brandencoder.transform(phone['phone_brand']) gatrain['brand'] = phone['brand'] gatest['brand'] = phone['brand'] Xtr_brand = csr_matrix((np.ones(gatrain.shape[0]), (gatrain.trainrow, gatrain.brand))) Xte_brand = csr_matrix((np.ones(gatest.shape[0]), (gatest.testrow, gatest.brand))) print('Brand features: train shape {}, test shape {}'.format(Xtr_brand.shape, Xte_brand.shape))",No,4,20.0 "appencoder = LabelEncoder().fit(appevents.app_id) appevents['app'] = appencoder.transform(appevents.app_id) napps = len(appencoder.classes_) deviceapps = (appevents.merge(events[['device_id']], how='left',left_on='event_id',right_index=True) .groupby(['device_id','app'])['app'].agg(['size']) .merge(gatrain[['trainrow']], how='left', left_index=True, right_index=True) .merge(gatest[['testrow']], how='left', left_index=True, right_index=True) .reset_index()) deviceapps.head()",No,4,12.0 "clf = LogisticRegression(C=0.02, multi_class='multinomial',solver='lbfgs') clf.fit(Xtrain, 
y) pred = pd.DataFrame(clf.predict_proba(Xtest), index = gatest.index, columns=targetencoder.classes_) pred.head()",No,4,49.0 "pred.to_csv('logreg_subm.csv',index=True)",No,5,25.0 "import pandas as pd import numpy as np import seaborn as sns import matplotlib.pyplot as plt import os from sklearn.preprocessing import LabelEncoder from scipy.sparse import csr_matrix, hstack import xgboost as xgb from sklearn.cross_validation import StratifiedKFold from sklearn.metrics import log_loss",No,5,22.0 "# function to bin the timestamp in time of day def bintod(x): if x < 3: return 0 elif x < 8: return 1 elif x < 20: return 2 elif x < 23: return 3 else: return 0 # functions to assign region based on latitude and longitude def lngregion(x): if x < 80: return 1 elif x < 90: return 2 elif x < 100: return 3 elif x < 110: return 4 elif x < 120: return 5 elif x < 130: return 6 elif x < 140: return 7 else: return 0 nlng = 8 nlat = 9 def latregion(x): if x < 20: return 1 elif x < 25: return 2 elif x < 30: return 3 elif x < 35: return 4 elif x < 40: return 5 elif x < 45: return 6 elif x < 50: return 7 elif x < 55: return 8 else: return 0 ",No,5,20.0 "datadir = '../input' gatrain = pd.read_csv(os.path.join(datadir,'gender_age_train.csv'), index_col='device_id') gatest = pd.read_csv(os.path.join(datadir,'gender_age_test.csv'), index_col = 'device_id') phone = pd.read_csv(os.path.join(datadir,'phone_brand_device_model.csv')) # Get rid of duplicate device ids in phone phone = phone.drop_duplicates('device_id',keep='first').set_index('device_id') events = pd.read_csv(os.path.join(datadir,'events.csv'), parse_dates=['timestamp'], index_col='event_id') appevents = pd.read_csv(os.path.join(datadir,'app_events.csv'), usecols=['event_id','app_id','is_active'], dtype={'is_active':bool}) applabels = pd.read_csv(os.path.join(datadir,'app_labels.csv')) labelcat = pd.read_csv(os.path.join(datadir,'label_categories.csv')) labelcat['category']=labelcat['category'].fillna('label-missing') labelcat.head()",No,4,45.0 "# clean lat, long info, 0s are missing, also set out of china to missing events['longitude'] = events['longitude'].round(0) events['latitude'] = events['latitude'].round(0) #set out of China to missing along with 0s events['longitude'] = events['longitude'].clip_lower(73.0).replace(73.0, np.NaN) events['longitude'] = events['longitude'].clip_upper(135.0).replace(135.0, np.NaN) events['latitude'] = events['latitude'].clip_lower(15.0).replace(15.0, np.NaN) events['latitude'] = events['latitude'].clip_upper(60.0).replace(60.0, np.NaN) # lot of missing values - replace them with mode (most common lat, long) events['latitude2'] =events.groupby(['device_id'])['latitude'].transform(lambda x: x.mode()) events['longitude2'] =events.groupby(['device_id'])['longitude'].transform(lambda x: x.mode())",No,3,17.0 "# lat long location for each device events_latlng = events[['device_id', 'latitude2','longitude2']].drop_duplicates('device_id', keep='first') events_latlng = events_latlng.set_index('device_id') print('Number of devices with some lat long info',len(events_latlng['latitude2'])) print('out of that missing longitude: ', sum(events_latlng['longitude2'].isnull())) print('out of that missing latitude: ', sum(events_latlng['latitude2'].isnull())) events_latlng['lng_region'] = events_latlng['longitude2'].apply(lngregion) events_latlng['lat_region'] = events_latlng['latitude2'].apply(latregion) print (""Frequencies longitude region:"" '\ ', events_latlng['lng_region'].value_counts()) print (""Frequencies latitude region:"" '\ 
', events_latlng['lat_region'].value_counts())'",No,4,20.0 "appencoder = LabelEncoder().fit(appevents.app_id) appevents['app'] = appencoder.transform(appevents.app_id) napps = len(appencoder.classes_) deviceapps = (appevents.merge(events[['device_id']], how='left',left_on='event_id',right_index=True) .groupby(['device_id','app'])['app'].agg(['size']) .merge(gatrain[['trainrow']], how='left', left_index=True, right_index=True) .merge(gatest[['testrow']], how='left', left_index=True, right_index=True) .reset_index())",No,4,20.0 "applabels = applabels.loc[applabels.app_id.isin(appevents.app_id.unique())] applabels['app'] = appencoder.transform(applabels.app_id) labelcat = labelcat.loc[labelcat.label_id.isin(applabels.label_id.unique())] labelencoder = LabelEncoder().fit(labelcat.category) labelcat['label'] = labelencoder.transform(labelcat.category) nlabels = len(labelencoder.classes_) print('number of unique labels:',nlabels) print('recoded label categories', '/n',labelcat.head(n=20)) applabels=applabels.merge(labelcat[['label','label_id']], how='left',left_on='label_id',right_on='label_id') devicelabels = (deviceapps[['device_id','app']] .merge(applabels[['app','label']]) .groupby(['device_id','label'])['app'].agg(['size']) .merge(gatrain[['trainrow']], how='left', left_index=True, right_index=True) .merge(gatest[['testrow']], how='left', left_index=True, right_index=True) .reset_index()) devicelabels.head()",Yes,4,20.0 "Xtrain = hstack((Xtr_brand, Xtr_model, Xtr_app, Xtr_label, Xtr_tod, Xtr_dow, Xtr_lat, Xtr_lng), format='csr') Xtest = hstack((Xte_brand, Xte_model, Xte_app, Xte_label, Xte_tod, Xte_dow, Xte_lat, Xte_lng), format='csr') print('All features: train shape {}, test shape {}'.format(Xtrain.shape, Xtest.shape))",No,4,11.0 "targetencoder = LabelEncoder().fit(gatrain.group) y = targetencoder.transform(gatrain.group)",No,5,20.0 "########## XGBOOST ########## params = {} params['booster'] = 'gblinear' params['objective'] = ""multi:softprob"" params['eval_metric'] = 'mlogloss' params['eta'] = 0.005 params['num_class'] = 12 params['lambda'] = 3 params['alpha'] = 2'",No,5,59.0 "clf = xgb.train(params, d_train, 1000, watchlist, early_stopping_rounds=25) pred = clf.predict(xgb.DMatrix(Xtest)) pred = pd.DataFrame(pred, index = gatest.index, columns=targetencoder.classes_) pred.head() pred.to_csv('sparse_xgb_v11.csv', index=True)",Yes,4,48.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load in import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) %matplotlib inline import seaborn as sns import matplotlib.pyplot as plt import os from sklearn.preprocessing import LabelEncoder from scipy.sparse import csr_matrix, hstack from sklearn.linear_model import LogisticRegression from sklearn.cross_validation import StratifiedKFold from sklearn.metrics import log_loss # Input data files are available in the ""../input/"" directory. 
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory from subprocess import check_output print(check_output([""ls"", ""../input""]).decode(""utf8"")) # Any results you write to the current directory are saved as output.",No,5,88.0 "b""# device_iddevice\n# ga_trainga_testapp_eventstraintest\nga_train = pd.read_csv('../input/gender_age_train.csv', index_col='device_id')\nga_test = pd.read_csv('../input/gender_age_test.csv', index_col='device_id') \n# event_idapp_eventsappevent_idevent_iddevice_idga_traindevice_idgenderage\nevents = pd.read_csv('../input/events.csv', index_col='event_id', parse_dates=['timestamp']) \n# appis_installed1\napp_events = pd.read_csv('../input/app_events.csv', usecols=['event_id','app_id','is_active'])\n# phone_branddevice_model\n##### \ndevice_brand = pd.read_csv('../input/phone_brand_device_model.csv')\ndevice_brand = device_brand.drop_duplicates('device_id').set_index('device_id')\napp_labels = pd.read_csv('../input/app_labels.csv')""",No,4,45.0 "import numpy as np import pandas as pd import os import gc nrows=100000 train=pd.read_csv('../input/train.csv',nrows=nrows) print(train.shape) print(train.columns) data=train.copy() data['target']=data['Demanda_uni_equil'] data.drop(['Demanda_uni_equil'],axis=1,inplace=True) nCliente_ID = pd.DataFrame(pd.groupby(data,['Cliente_ID','Semana'])['target'].count()) print(nCliente_ID.shape) print(nCliente_ID.columns) print(nCliente_ID.head(2)) nCliente_ID = nCliente_ID.reset_index() print(nCliente_ID.shape) print(nCliente_ID.columns) print(nCliente_ID.head(2)) nCliente_ID.rename(columns={'target': 'nCliente_ID'}, inplace=True) print(nCliente_ID.shape) print(nCliente_ID.columns) print(nCliente_ID.head(2)) nCliente_ID = pd.DataFrame(pd.groupby(nCliente_ID,['Cliente_ID'])['nCliente_ID'].mean()) print(nCliente_ID.shape) print(nCliente_ID.columns) print(nCliente_ID.head(2)) nCliente_ID = nCliente_ID.reset_index() print(nCliente_ID.shape) print(nCliente_ID.columns) print(nCliente_ID.head(2)) data = pd.merge(data, nCliente_ID, how='left', left_on=['Cliente_ID'], right_on=['Cliente_ID'], left_index=False, right_index=False, sort=True, suffixes=('_x', '_y'), copy=False) print(data.columns) print(data.head(50)) del nCliente_ID gc.collect() print('merge completo nCliente_ID') print(data.shape[0])",Yes,2,22.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load in import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) # Input data files are available in the ""../input/"" directory. # For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory from subprocess import check_output print(check_output([""ls"", ""../input""]).decode(""utf8"")) # Any results you write to the current directory are saved as output. 
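# (Summary of the run_solution() helper defined in this cell: it streams train.csv
#  once, accumulating demand sums and counts per (Cliente_ID, Producto_ID), per
#  client and per product, then writes a prediction for each test row using the most
#  specific mean available - client+product, else client, else product, else the
#  overall average. The call at the bottom is commented out in this version.)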
import datetime import time from collections import defaultdict import gc def run_solution(): print('Preparing arrays...') f = open(""../input/train.csv"", ""r"") f.readline() total = 0 client_product_arr = defaultdict(int) client_product_arr_count = defaultdict(int) client_arr = defaultdict(int) client_arr_count = defaultdict(int) product_arr = defaultdict(int) product_arr_count = defaultdict(int) # Calc counts avg_target = 0.0 while 1: line = f.readline().strip() total += 1 if total % 10000000 == 0: print('Read {} lines...'.format(total)) if line == '': break arr = line.split("","") week = int(arr[0]) agency = arr[1] canal_id = arr[2] ruta_sak = arr[3] cliente_id = int(arr[4]) producto_id = int(arr[5]) vuh = arr[6] vh = arr[7] dup = arr[8] dp = arr[9] target = int(arr[10]) avg_target += target client_product_arr[(cliente_id, producto_id)] += target client_product_arr_count[(cliente_id, producto_id)] += 1 client_arr[cliente_id] += target client_arr_count[cliente_id] += 1 product_arr[producto_id] += target product_arr_count[producto_id] += 1 f.close() avg_target /= total print('Average target: ', avg_target) gc.collect() print('Generate submission...') now = datetime.datetime.now() path = 'submission_' + str(now.strftime(""%Y-%m-%d-%H-%M"")) + '.csv' out = open(path, ""w"") f = open(""../input/test.csv"", ""r"") f.readline() total = 0 out.write(""id,Demanda_uni_equil\ "") index_both = 0 index_client = 0 index_product = 0 index_empty = 0 while 1: line = f.readline().strip() total += 1 if total % 10000000 == 0: print('Write {} lines...'.format(total)) if line == '': break arr = line.split("","") id = arr[0] week = int(arr[1]) agency = arr[2] canal_id = arr[3] ruta_sak = arr[4] cliente_id = int(arr[5]) producto_id = int(arr[6]) out.write(str(id) + ',') if (cliente_id, producto_id) in client_product_arr: val = client_product_arr[(cliente_id, producto_id)]/client_product_arr_count[(cliente_id, producto_id)] out.write(str(val)) index_both += 1 elif cliente_id in client_arr: val = client_arr[cliente_id]/client_arr_count[cliente_id] out.write(str(val)) index_client += 1 elif producto_id in product_arr: val = product_arr[producto_id]/product_arr_count[producto_id] out.write(str(val)) index_product += 1 else: out.write(str(avg_target)) index_empty += 1 out.write(""\ "") print('Both: {}'.format(index_both)) print('Client: {}'.format(index_client)) print('Product: {}'.format(index_product)) print('Empty: {}'.format(index_empty)) out.close() f.close() start_time = time.time() #run_solution() print(""Elapsed time overall: %s seconds"" % (time.time() - start_time))'",Yes,5,53.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load in import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) import seaborn as sns import matplotlib.pyplot as plt import matplotlib.cm as cm from sklearn.preprocessing import LabelEncoder from sklearn.cross_validation import KFold from sklearn.metrics import log_loss # Input data files are available in the ""../input/"" directory. 
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory import os print(os.listdir(""../input"")) # Any results you write to the current directory are saved as output.",No,5,88.0 "df_gender_age_test = pd.read_csv('../input/gender_age_test.csv', dtype={'device_id': np.str}) df_gender_age_train = pd.read_csv('../input/gender_age_train.csv', dtype={'device_id': np.str}) df_app_events = pd.read_csv('../input/app_events.csv', dtype={'app_id': np.str}) df_events = pd.read_csv('../input/events.csv', dtype={'device_id': np.str}) df_app_labels = pd.read_csv('../input/app_labels.csv', dtype={'app_id': np.str}) df_label_categories = pd.read_csv('../input/label_categories.csv') df_phone_brands = pd.read_csv('../input/phone_brand_device_model.csv', dtype={'device_id': np.str})",No,5,45.0 df_gender_age_test.head(),No,5,41.0 "df_gender_age_test.device_id.nunique(), df_gender_age_test.shape[0]",Yes,5,54.0 df_gender_age_train.head(),No,5,41.0 "df_gender_age_train.device_id.nunique(), df_gender_age_train.shape[0]",Yes,5,54.0 df_gender_age_train.info(),No,5,40.0 df_gender_age_train.describe(include='all').T,No,5,40.0 "df_ga_full = pd.concat([df_gender_age_train, df_gender_age_test], axis=0, sort=False)",No,5,11.0 df_ga_full.device_id.nunique(),No,5,54.0 df_events.head(),No,5,41.0 "df_events.event_id.nunique(), df_events.device_id.nunique(), df_events.shape[0]",Yes,3,58.0 df_app_events.head(),No,5,41.0 "df_app_events.event_id.nunique(), df_app_events.shape[0]",Yes,4,54.0 "# df_gender_age_train.device_id[] in_train_events = df_events[df_events.device_id.isin(set(df_gender_age_train.device_id) & set(df_events.device_id))] in_train_app_events = df_app_events[df_app_events.event_id.isin(in_train_events.event_id)] in_train_app_events.event_id.nunique(), in_train_app_events.event_id.size, len(in_train_events)",Yes,3,14.0 "in_test_events = df_events[df_events.device_id.isin(set(df_gender_age_test.device_id) & set(df_events.device_id))] in_test_app_events = df_app_events[df_app_events.event_id.isin(in_test_events.event_id)] in_train_app_events.event_id.nunique(), in_train_app_events.event_id.size, len(in_train_events)",Yes,3,14.0 "del in_train_events del in_train_app_events del in_test_events del in_test_app_events",No,5,10.0 "import gc gc.collect()",No,5,23.0 df_app_labels.head(),No,5,41.0 "df_app_labels.app_id.nunique(), df_app_labels.label_id.nunique(), df_app_labels.shape[0]",Yes,5,54.0 df_label_categories.head(),No,5,41.0 "df_label_categories.category.nunique(), df_label_categories.shape[0]",Yes,5,54.0 df_phone_brands.head(),No,5,41.0 "df_phone_brands.device_id.nunique(), df_phone_brands.shape[0]",Yes,5,54.0 "df_phone_brands.drop_duplicates(subset='device_id', inplace=True)",No,5,19.0 a.shape[0],No,5,58.0 "df_phone_brands.phone_brand = df_phone_brands.phone_brand.map(str.strip).map(str.lower) df_phone_brands.device_model = df_phone_brands.device_model.map(str.strip).map(str.lower) df_phone_brands.device_model = df_phone_brands.phone_brand.str.cat(df_phone_brands.device_model)",No,5,78.0 df_phone_brands.info(),No,5,40.0 df_phone_brands.describe(),No,5,40.0 "df_ga_full = df_ga_full.merge(df_phone_brands, how='left', on='device_id')",No,5,32.0 "df_train = df_ga_full.loc[df_ga_full.device_id.isin(df_gender_age_train.device_id.tolist())] df_test = df_ga_full.loc[df_ga_full.device_id.isin(df_gender_age_test.device_id.tolist())]",No,5,13.0 "# sns.kdeplot(df_gender_age_train.age) fig = plt.figure(figsize=(9, 6)) sns.distplot(df_gender_age_train.age, ax=fig.gca()) 
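# Note: sns.distplot (histogram with a KDE overlay) is deprecated in seaborn >= 0.11;
# sns.histplot(df_gender_age_train.age, kde=True) is the closest modern equivalent.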
plt.title('Age distribution') sns.despine()",No,5,33.0 
"fig = plt.figure(figsize=(7, 4)) sns.barplot(x = df_gender_age_train.gender.value_counts().index, y=df_gender_age_train.gender.value_counts().values, ax=fig.gca()) sns.despine() plt.title('Gender distribution')",No,5,33.0 
"df_gender_age_train.groupby('group').device_id.size().sort_index(ascending=False).plot.barh(title='Age Gender Group Distribution') sns.despine()",No,5,33.0 
"share_majority = market_share[~(market_share>0.95)].index.tolist() share_others = market_share[market_share>0.95].index.tolist() share_majority2 = market_share2[~(market_share2>0.60)].index.tolist() share_others2 = market_share2[market_share2>0.60].index.tolist()",No,3,13.0 
"# https://seaborn.pydata.org/tutorial/categorical.html # sns.swarmplot(x=""phone_brand"", y=""age"", hue=""gender"", data=df_train); fig = plt.figure(figsize=(20, 6)) ax = sns.boxplot(x=""phone_brand"", y=""age"", hue=""gender"", data=df_train[df_train.phone_brand.isin(share_majority)].sort_values('age'), ax=fig.gca()); ax.set_xticklabels(share_majority, rotation=30); str(share_majority)",No,4,33.0 
"fig = plt.figure(figsize=(20, 6)) ax = sns.boxplot(x=""device_model"", y=""age"", hue=""gender"", data=df_train[df_train.device_model.isin(share_majority2)].sort_values('age'), ax=fig.gca()); ax.set_xticklabels(ax.get_xticklabels(), rotation=30); str(share_majority2)",No,4,33.0 
"# collapse the label_ids of each app into one space-separated string (one row per app_id) # df_app_labels.groupby('app_id').label_id.groups df_app_labels = df_app_labels.groupby('app_id').label_id.apply(lambda x: ' '.join(str(s) for s in x)) df_app_labels.head()",Yes,4,78.0 
df_app_events = df_app_events.groupby('event_id').app_lab.apply(lambda x: ' '.join(str(s) for s in x)),No,4,78.0 
"del df_label_categories del df_app_labels",No,4,10.0 
df_events['app_lab'] = df_events.event_id.map(df_app_events),No,5,8.0 
df_events['timestamp'] = pd.to_datetime(df_events['timestamp']),No,5,16.0 
df_events['hour'] = df_events['timestamp'].dt.hour,No,5,8.0 
time_large = df_events.groupby('device_id')['hour'].apply(lambda x: max(x)),No,5,60.0 
time_small = df_events.groupby('device_id')['hour'].apply(lambda x: min(x)),No,5,60.0 
"from collections import Counter time_most = df_events.groupby('device_id')['hour'].apply(lambda x: Counter(x).most_common(1)[0][0])",Yes,3,22.0 
del df_app_events,No,4,10.0 
"df_events.app_lab = df_events.app_lab.fillna('Missing') df_events = df_events.groupby('device_id').app_lab.apply(lambda x: ' '.join(str(s) for s in x))",Yes,4,17.0 
"df_ga_full['app_lab']= df_ga_full['device_id'].map(df_events) df_ga_full['time_most']= df_ga_full['device_id'].map(time_most) df_ga_full['time_large']= df_ga_full['device_id'].map(time_large) df_ga_full['time_small']= df_ga_full['device_id'].map(time_small)",No,5,20.0 
df_ga_full.head(),No,5,41.0 
"del df_train del df_test del df_events del df_phone_brands del time_large del time_most del time_small",No,4,10.0 
"fig = plt.figure(figsize=(20, 6)) ax = sns.boxplot(x=""time_most"", y=""age"", hue=""gender"", data=df_ga_full, ax=fig.gca()); ax.set_xticklabels(ax.get_xticklabels(), rotation=30);",No,5,33.0 
"fig = plt.figure(figsize=(20, 6)) ax = sns.boxplot(x=""time_large"", y=""age"", hue=""gender"", data=df_ga_full, ax=fig.gca()); ax.set_xticklabels(ax.get_xticklabels(), rotation=30);",No,5,33.0 
"fig = plt.figure(figsize=(20, 6)) ax = sns.boxplot(x=""time_small"", y=""age"", hue=""gender"", data=df_ga_full, ax=fig.gca()); ax.set_xticklabels(ax.get_xticklabels(), rotation=30);",No,5,33.0 
"from sklearn.feature_extraction.text import CountVectorizer vectorizer = CountVectorizer(binary=True) # fill missing app_lab values with 'Missing' before building the binary bag-of-labels matrix df_app_lab_vectorized = vectorizer.fit_transform(df_ga_full['app_lab'].fillna('Missing')) # label category feature names str(vectorizer.get_feature_names())",Yes,5,8.0 
"app_labels = pd.DataFrame(df_app_lab_vectorized.toarray(), columns=vectorizer.get_feature_names(), index=df_ga_full.device_id) app_labels.head(3)",No,4,12.0 
"df_ga_full = df_ga_full.merge(app_labels, how='left', left_on='device_id', right_index=True)",No,5,32.0 
df_ga_full.head(3),No,5,41.0 
"df_ga_full = pd.get_dummies(df_ga_full.drop(columns=['gender', 'age', 'app_lab']), columns=['phone_brand', 'device_model', 'time_most', 'time_large', 'time_small'])",No,5,20.0 
df_ga_full.shape,No,5,58.0 
df_ga_full.info(),No,5,40.0 
df_ga_full.describe(),No,5,40.0 
"train = df_ga_full[df_ga_full.device_id.isin(df_gender_age_train.device_id)] test = df_ga_full[df_ga_full.device_id.isin(df_gender_age_test.device_id)].drop(columns=['group']) X = train.drop(columns=['group']) encoder = LabelEncoder() Y = encoder.fit_transform(train['group'])",Yes,5,21.0 
"X.shape, Y.shape",No,5,58.0 
"from sklearn.linear_model import LogisticRegression from sklearn.cross_validation import cross_val_score # scores = cross_val_score(LogisticRegression(), X, Y, scoring='neg_log_loss',cv=10, verbose=1)",No,5,22.0 
"import xgboost as xgb from sklearn.model_selection import train_test_split X.set_index('device_id', inplace=True) X_train, X_val, y_train, y_val = train_test_split(X, Y, train_size=.80) ################## # XGBoost ################## dtrain = xgb.DMatrix(X_train, y_train) dvalid = xgb.DMatrix(X_val, y_val) params = { ""objective"": ""multi:softprob"", ""num_class"": 12, # the encoded target Y has 12 gender/age classes ""booster"": ""gbtree"", # tree booster; gblinear is the linear alternative ""eval_metric"": ""mlogloss"", ""eta"": 0.3, # GBM learning rate ""silent"": 0, # 0 prints training messages, 1 silences them } watchlist = [(dtrain, 'train'), (dvalid, 'eval')] gbm = xgb.train(params, dtrain, 140, evals=watchlist, verbose_eval=True)",Yes,3,7.0 
"test.set_index('device_id', inplace=True) y_pre = gbm.predict(xgb.DMatrix(test), ntree_limit=gbm.best_iteration) # scores = cross_val_score(RandomForestClassifier(n_est",No,5,48.0 
pd.read_csv('../input/sample_submission.csv').head(),No,5,41.0 
"result = pd.DataFrame(y_pre, index=test.index, columns=encoder.classes_) result.head()",Yes,4,12.0 
result.to_csv('./predict_prob.csv'),No,5,25.0 
pd.read_csv('./predict_prob.csv').head(),No,5,45.0 
"import gc import numpy as np import pandas as pd import xgboost as xgb from sklearn.cross_validation import StratifiedKFold from sklearn.metrics import matthews_corrcoef from operator import itemgetter # per raddar, all date features except for stations 24+25 are identical def get_date_features(): directory = '../input/' trainfile = 'train_date.csv' for i, chunk in enumerate(pd.read_csv(directory + trainfile, chunksize=1, low_memory=False)): features = list(chunk.columns) break seen = np.zeros(52) rv = [] for f in features: if f == 'Id': rv.append(f) continue station = int(f.split('_')[1][1:]) if seen[station]: continue seen[station] = 1 rv.append(f) return rv usefuldatefeatures = get_date_features() def get_mindate(): directory = '../input/' trainfile = 'train_date.csv' testfile = 'test_date.csv' features = None subset = None for i, chunk in enumerate(pd.read_csv(directory + trainfile, usecols=usefuldatefeatures, chunksize=50000, low_memory=False)): print(i) if features is None: features = list(chunk.columns) features.remove('Id') df_mindate_chunk = chunk[['Id']].copy() df_mindate_chunk['mindate'] = 
chunk[features].min(axis=1).values if subset is None: subset = df_mindate_chunk.copy() else: subset = pd.concat([subset, df_mindate_chunk]) del chunk gc.collect() for i, chunk in enumerate(pd.read_csv(directory + testfile, usecols=usefuldatefeatures, chunksize=50000, low_memory=False)): print(i) df_mindate_chunk = chunk[['Id']].copy() df_mindate_chunk['mindate'] = chunk[features].min(axis=1).values subset = pd.concat([subset, df_mindate_chunk]) del chunk gc.collect() return subset df_mindate = get_mindate() df_mindate.sort_values(by=['mindate', 'Id'], inplace=True) df_mindate['mindate_id_diff'] = df_mindate.Id.diff() midr = np.full_like(df_mindate.mindate_id_diff.values, np.nan) midr[0:-1] = -df_mindate.mindate_id_diff.values[1:] df_mindate['mindate_id_diff_reverse'] = midr def mcc(tp, tn, fp, fn): sup = tp * tn - fp * fn inf = (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn) if inf == 0: return 0 else: return sup / np.sqrt(inf) def eval_mcc(y_true, y_prob, show=False): idx = np.argsort(y_prob) y_true_sort = y_true[idx] n = y_true.shape[0] nump = 1.0 * np.sum(y_true) # number of positive numn = n - nump # number of negative tp = nump tn = 0.0 fp = numn fn = 0.0 best_mcc = 0.0 best_id = -1 mccs = np.zeros(n) for i in range(n): if y_true_sort[i] == 1: tp -= 1.0 fn += 1.0 else: fp -= 1.0 tn += 1.0 new_mcc = mcc(tp, tn, fp, fn) mccs[i] = new_mcc if new_mcc >= best_mcc: best_mcc = new_mcc best_id = i if show: best_proba = y_prob[idx[best_id]] y_pred = (y_prob > best_proba).astype(int) return best_proba, best_mcc, y_pred else: return best_mcc def mcc_eval(y_prob, dtrain): y_true = dtrain.get_label() best_mcc = eval_mcc(y_true, y_prob) return 'MCC', best_mcc def create_feature_map(features): outfile = open('xgb.fmap', 'w') for i, feat in enumerate(features): outfile.write('{0}\\t{1}\\tq\ '.format(i, feat)) outfile.close() def get_importance(gbm, features): create_feature_map(features) importance = gbm.get_fscore(fmap='xgb.fmap') importance = sorted(importance.items(), key=itemgetter(1), reverse=True) return importance def LeaveOneOut(data1, data2, columnName, useLOO=False): grpOutcomes = data1.groupby(columnName)['Response'].mean().reset_index() grpCount = data1.groupby(columnName)['Response'].count().reset_index() grpOutcomes['cnt'] = grpCount.Response if(useLOO): grpOutcomes = grpOutcomes[grpOutcomes.cnt > 1] grpOutcomes.drop('cnt', inplace=True, axis=1) outcomes = data2['Response'].values x = pd.merge(data2[[columnName, 'Response']], grpOutcomes, suffixes=('x_', ''), how='left', on=columnName, left_index=True)['Response'] if(useLOO): x = ((x*x.shape[0])-outcomes)/(x.shape[0]-1) # x = x + np.random.normal(0, .01, x.shape[0]) return x.fillna(x.mean()) def GrabData(): directory = '../input/' trainfiles = ['train_categorical.csv', 'train_date.csv', 'train_numeric.csv'] testfiles = ['test_categorical.csv', 'test_date.csv', 'test_numeric.csv'] cols = [['Id', 'L1_S24_F1559', 'L3_S32_F3851', 'L1_S24_F1827', 'L1_S24_F1582', 'L3_S32_F3854', 'L1_S24_F1510', 'L1_S24_F1525', 'L2_S26_F3099'], ['Id', 'L3_S30_D3496', 'L3_S30_D3506', 'L3_S30_D3501', 'L3_S30_D3516', 'L3_S30_D3511', 'L3_S32_D3852', 'L3_S33_D3858', 'L3_S34_D3875', 'L3_S29_D3316'], ['Id', 'L1_S24_F1846', 'L3_S32_F3850', 'L1_S24_F1695', 'L1_S24_F1632', 'L3_S33_F3855', 'L1_S24_F1604', 'L3_S29_F3407', 'L3_S33_F3865', 'L3_S38_F3952', 'L1_S24_F1723', 'L3_S33_F3861', 'L3_S33_F3857', 'L3_S33_F3859', 'L3_S34_F3876', 'L3_S29_F3461', 'Response']] traindata = None testdata = None for i, f in enumerate(trainfiles): print(f) subset = None for i, chunk in 
enumerate(pd.read_csv(directory + f, usecols=cols[i], chunksize=50000, low_memory=False)): print(i) if subset is None: subset = chunk.copy() else: subset = pd.concat([subset, chunk]) del chunk gc.collect() if traindata is None: traindata = subset.copy() else: traindata = pd.merge(traindata, subset.copy(), on=""Id"") del subset gc.collect() del cols[2][-1] # Test doesn't have response! for i, f in enumerate(testfiles): print(f) subset = None for i, chunk in enumerate(pd.read_csv(directory + f, usecols=cols[i], chunksize=50000, low_memory=False)): print(i) if subset is None: subset = chunk.copy() else: subset = pd.concat([subset, chunk]) del chunk gc.collect() if testdata is None: testdata = subset.copy() else: testdata = pd.merge(testdata, subset.copy(), on=""Id"") del subset gc.collect() traindata = traindata.merge(df_mindate, on='Id') testdata = testdata.merge(df_mindate, on='Id') testdata['Response'] = 0 # Add Dummy Value visibletraindata = traindata[::2] blindtraindata = traindata[1::2] print(blindtraindata.columns) for i in range(2): for col in cols[i][1:]: print(col) blindtraindata.loc[:, col] = LeaveOneOut(visibletraindata, blindtraindata, col, False).values testdata.loc[:, col] = LeaveOneOut(visibletraindata, testdata, col, False).values del visibletraindata gc.collect() testdata.drop('Response', inplace=True, axis=1) return blindtraindata, testdata def Train(): train, test = GrabData() print('Train:', train.shape) print('Test', test.shape) features = list(train.columns) features.remove('Response') features.remove('Id') print(features) num_rounds = 50 params = {} params['objective'] = ""binary:logistic"" params['eta'] = 0.021 params['max_depth'] = 7 params['colsample_bytree'] = 0.82 params['min_child_weight'] = 3 params['base_score'] = 0.005 params['silent'] = True print('Fitting') trainpredictions = None testpredictions = None dvisibletrain = \\ xgb.DMatrix(train[features], train.Response, silent=True) dtest = \\ xgb.DMatrix(test[features], silent=True) folds = 1 for i in range(folds): print('Fold:', i) params['seed'] = i watchlist = [(dvisibletrain, 'train'), (dvisibletrain, 'val')] clf = xgb.train(params, dvisibletrain, num_boost_round=num_rounds, evals=watchlist, early_stopping_rounds=20, feval=mcc_eval, maximize=True ) limit = clf.best_iteration+1 # limit = clf.best_ntree_limit predictions = \\ clf.predict(dvisibletrain, ntree_limit=limit) best_proba, best_mcc, y_pred = eval_mcc(train.Response, predictions, True) print('tree limit:', limit) print('mcc:', best_mcc) print(matthews_corrcoef(train.Response, y_pred)) if(trainpredictions is None): trainpredictions = predictions else: trainpredictions += predictions predictions = clf.predict(dtest, ntree_limit=limit) if(testpredictions is None): testpredictions = predictions else: testpredictions += predictions imp = get_importance(clf, features) print('Importance array: ', imp) best_proba, best_mcc, y_pred = eval_mcc(train.Response, trainpredictions/folds, True) print(matthews_corrcoef(train.Response, y_pred)) submission = pd.DataFrame({""Id"": train.Id, ""Prediction"": trainpredictions/folds, ""Response"": train.Response}) submission[['Id', 'Prediction', 'Response']].to_csv('rawtrainxgbsubmission'+str(folds)+'.csv', index=False) submission = pd.DataFrame({""Id"": test.Id.values, ""Response"": testpredictions/folds}) submission[['Id', 'Response']].to_csv('rawxgbsubmission'+str(folds)+'.csv', index=False) y_pred = (testpredictions/folds > .22).astype(int) submission = pd.DataFrame({""Id"": test.Id.values, ""Response"": y_pred}) 
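# Note: the binary labels written below use a hard-coded 0.22 probability cutoff rather than the
# MCC-optimal threshold (best_proba) that eval_mcc found on the training predictions above.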
submission[['Id', 'Response']].to_csv('xgbsubmission'+str(folds)+'.csv', index=False) if __name__ == ""__main__"": print('Started') Train() print('Finished') '",No,5,53.0 "date = pd.read_csv('../input/bosch-production-line-performance/train_date.csv.zip', nrows=10000) numeric = pd.read_csv('../input/bosch-production-line-performance/train_numeric.csv.zip', nrows=10000) category = pd.read_csv('../input/bosch-production-line-performance/train_categorical.csv.zip', nrows=10000)",No,5,45.0 date,No,3,41.0 numeric,No,3,41.0 category,No,3,41.0 "num_feats = ['Id', 'L3_S30_F3514', 'L0_S9_F200', 'L3_S29_F3430', 'L0_S11_F314', 'L0_S0_F18', 'L3_S35_F3896', 'L0_S12_F350', 'L3_S36_F3918', 'L0_S0_F20', 'L3_S30_F3684', 'L1_S24_F1632', 'L0_S2_F48', 'L3_S29_F3345', 'L0_S18_F449', 'L0_S21_F497', 'L3_S29_F3433', 'L3_S30_F3764', 'L0_S1_F24', 'L3_S30_F3554', 'L0_S11_F322', 'L3_S30_F3564', 'L3_S29_F3327', 'L0_S2_F36', 'L0_S9_F180', 'L3_S33_F3855', 'L0_S0_F4', 'L0_S21_F477', 'L0_S5_F114', 'L0_S6_F122', 'L1_S24_F1122', 'L0_S9_F165', 'L0_S18_F439', 'L1_S24_F1490', 'L0_S6_F132', 'L3_S29_F3379', 'L3_S29_F3336', 'L0_S3_F80', 'L3_S30_F3749', 'L1_S24_F1763', 'L0_S10_F219', 'Response']",No,4,77.0 "minmaxfeatures.sort_values(by=['mindate', 'Id'], inplace=True) minmaxfeatures['min_Id_rev'] = -minmaxfeatures.Id.diff().shift(-1) minmaxfeatures['min_Id'] = minmaxfeatures.Id.diff()",No,2,12.0 "cols = [['Id']+date_cols,num_feats]",No,5,77.0 "traindata = None testdata = None",No,5,77.0 "trainfiles = ['train_date.csv.zip','train_numeric.csv.zip'] testfiles = ['test_date.csv.zip','test_numeric.csv.zip']",No,5,77.0 "traindata = traindata.merge(minmaxfeatures, on='Id') traindata = traindata.merge(data, on='Id') testdata = testdata.merge(minmaxfeatures, on='Id') testdata = testdata.merge(data, on='Id')",No,5,32.0 "del minmaxfeatures,data gc.collect()",Yes,4,10.0 "train = traindata[::2] valid = traindata[1::2]",No,5,13.0 "del traindata gc.collect()",Yes,4,10.0 "def mcc(tp, tn, fp, fn): num = tp * tn - fp * fn den = (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn) if den == 0: return 0 else: return num / np.sqrt(den)",No,4,49.0 "def eval_mcc(y_true, y_prob): idx = np.argsort(y_prob) y_true_sort = y_true[idx] n = y_true.shape[0] nump = 1.0 * np.sum(y_true) numn = n - nump tp,fp = nump,numn tn,fn = 0.0,0.0 best_mcc = 0.0 best_id = -1 mccs = np.zeros(n) for i in range(n): if y_true_sort[i] == 1: tp -= 1.0 fn += 1.0 else: fp -= 1.0 tn += 1.0 new_mcc = mcc(tp, tn, fp, fn) mccs[i] = new_mcc if new_mcc >= best_mcc: best_mcc = new_mcc best_id = i return best_mcc",No,3,49.0 "def mcc_eval(y_prob, dtrain): y_true = dtrain.get_label() best_mcc = eval_mcc(y_true, y_prob) return 'MCC', best_mcc",No,3,49.0 "import xgboost as xgb params = {'objective':""binary:logistic"", 'max_depth':25, 'base_score':0.005, 'eval_metric':'auc', 'n_jobs':-1 }'",Yes,5,59.0 "trainm = xgb.DMatrix(train.drop(['Response','Id'],axis=1),train['Response']) validm = xgb.DMatrix(valid.drop(['Response','Id'],axis=1),valid['Response']) test = xgb.DMatrix(testdata.drop(['Id'],axis=1))",No,5,21.0 "watchlist = [(trainm, 'train'), (validm, 'val')] clf = xgb.train(params, trainm, num_boost_round=100, evals=watchlist, early_stopping_rounds=20, feval=mcc_eval, maximize=True )",Yes,5,7.0 predictions = clf.predict(validm),No,5,48.0 "fig, ax = plt.subplots(figsize=(12,18)) xgb.plot_importance(clf,ax=ax)",No,5,79.0 test = clf.predict(test),No,5,48.0 "testdata['Response'] = (test>best_prob).astype(int) testdata[['Id','Response']].to_csv(""submitwoId.csv"",index=False)'",Yes,5,25.0 "import numpy 
as np import pandas as pd import seaborn as sns import matplotlib.pyplot as plt import tqdm import gc import sys import warnings warnings.filterwarnings(""ignore"")",No,5,23.0 "data = data[['Id','start_station','end_station']] usefuldatefeatures = ['Id']+date_cols",No,4,77.0 "minmaxfeatures = None for chunk in pd.read_csv('../input/bosch-production-line-performance/train_date.csv.zip',usecols=usefuldatefeatures,chunksize=50000,low_memory=False): features = chunk.columns.values.tolist() features.remove('Id') df_mindate_chunk = chunk[['Id']].copy() df_mindate_chunk['mindate'] = chunk[features].min(axis=1).values df_mindate_chunk['maxdate'] = chunk[features].max(axis=1).values df_mindate_chunk['min_time_station'] = chunk[features].idxmin(axis = 1).apply(lambda s: int(s.split('_')[1][1:]) if s is not np.nan else -1) df_mindate_chunk['max_time_station'] = chunk[features].idxmax(axis = 1).apply(lambda s: int(s.split('_')[1][1:]) if s is not np.nan else -1) minmaxfeatures = pd.concat([minmaxfeatures, df_mindate_chunk]) del chunk gc.collect()",Yes,3,8.0 "for chunk in pd.read_csv('../input/bosch-production-line-performance/test_date.csv.zip',usecols=usefuldatefeatures,chunksize=50000,low_memory=False): features = chunk.columns.values.tolist() features.remove('Id') df_mindate_chunk = chunk[['Id']].copy() df_mindate_chunk['mindate'] = chunk[features].min(axis=1).values df_mindate_chunk['maxdate'] = chunk[features].max(axis=1).values df_mindate_chunk['min_time_station'] = chunk[features].idxmin(axis = 1).apply(lambda s: int(s.split('_')[1][1:]) if s is not np.nan else -1) df_mindate_chunk['max_time_station'] = chunk[features].idxmax(axis = 1).apply(lambda s: int(s.split('_')[1][1:]) if s is not np.nan else -1) minmaxfeatures = pd.concat([minmaxfeatures, df_mindate_chunk]) del chunk gc.collect()",Yes,3,8.0 traindata,No,5,41.0 testdata,No,5,41.0 "traindata.fillna(value=0,inplace=True) testdata.fillna(value=0,inplace=True)",No,5,17.0 "model = RandomForestClassifier(n_estimators=500,n_jobs=-1,verbose=1,random_state=11) model.fit(total.drop(['Response','Id'],axis=1),total['Response'])",Yes,5,7.0 "test = model.predict(testdata.drop(['Id'],axis=1))",No,5,48.0 "testdata['Response'] = test testdata[['Id','Response']].to_csv(""submit.csv"",index=False)'",No,5,25.0 total,No,5,41.0 "import numpy as np from sklearn.ensemble import RandomForestClassifier import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv) from sklearn import preprocessing import os print(os.listdir(""../input"")) from sklearn.linear_model import LogisticRegression from sklearn.naive_bayes import GaussianNB from sklearn import tree from sklearn import svm from sklearn.ensemble import AdaBoostClassifier from sklearn.neighbors import KNeighborsClassifier from sklearn import ensemble from sklearn.metrics import accuracy_score",Yes,5,88.0 "%%time event_type=pd.read_csv(""../input/event_type.csv"",error_bad_lines=False) train = pd.read_csv(""../input/train.csv"") severity_type = pd.read_csv(""../input/severity_type.csv"") log_feature = pd.read_csv(""../input/log_feature.csv"") test = pd.read_csv(""../input/test.csv"") resource_type = pd.read_csv(""../input/resource_type.csv"",error_bad_lines=False) sample_submission = pd.read_csv(""../input/sample_submission.csv"")",No,5,45.0 "print(""test"",test.shape) print(""train"",train.shape)",No,5,58.0 "print('test',test.head()) print('train',train.head(4)) print('sample_submission',sample_submission.head()) print('event_type',event_type.shape,event_type.head(2)) print('severity_type',severity_type.shape,severity_type.head(2)) print('log_feature',log_feature.shape,log_feature.head(2)) print('resource_type',resource_type.shape,resource_type.head(2))",No,5,41.0 "event_type['id']=pd.to_numeric(event_type['id'],errors='coerce') #converting object datatype into numeric",No,5,16.0 event_type.dtypes,No,5,70.0 "def merge_fn(df1,df2,col_name,how_param): merged_df=df1.merge(df2,how=how_param,on=col_name) return merged_df ",No,5,32.0 "train_merge1=merge_fn(train,event_type.drop_duplicates(subset=['id']),'id','left') train_merge2=merge_fn(train_merge1,severity_type.drop_duplicates(subset=['id']),'id','left') train_merge3=merge_fn(train_merge2,log_feature.drop_duplicates(subset=['id']),'id','left') train_merge4=merge_fn(train_merge3,resource_type.drop_duplicates(subset=['id']),'id','left')",No,5,32.0 train_merge4.shape,No,5,58.0 train_merge4.head(),No,5,41.0 train_merge4.dtypes,No,5,70.0 train_merge4.isnull().sum(),No,5,39.0 cat_col=list(set(train_merge4.columns)-set(train_merge4._get_numeric_data().columns)),No,5,77.0 "train_merge4=categorical_conversion(train_merge4,cat_col) ",No,5,16.0 "def label_encoding_conversion(df,cat_col): le=preprocessing.LabelEncoder() for i in range(len(cat_col)): df[cat_col[i]]=le.fit_transform(df[cat_col[i]]) return df",No,5,20.0 train_merge4.columns,No,5,71.0 "train_merge4=label_encoding_conversion(train_merge4,cat_col)",No,5,20.0 "train_merge4.drop(['id'],axis=1,inplace=True)",No,5,10.0 target=train_merge4[['fault_severity']],No,5,21.0 "train_merge4.drop(['fault_severity'],axis=1,inplace=True)",No,5,10.0 "test_merge1=merge_fn(test,event_type.drop_duplicates(subset=['id']),'id','left') test_merge2=merge_fn(test_merge1,severity_type.drop_duplicates(subset=['id']),'id','left') test_merge3=merge_fn(test_merge2,log_feature.drop_duplicates(subset=['id']),'id','left') test_merge4=merge_fn(test_merge3,resource_type.drop_duplicates(subset=['id']),'id','left')",No,5,32.0 test_merge4.shape,No,5,58.0 severity_type.head(),No,5,41.0 test_merge4.head(2),No,5,41.0 cat_col,No,5,53.0 "test_merge4=label_encoding_conversion(test_merge4,cat_col)",No,5,20.0 test_merge4.dtypes,No,5,70.0 "test_merge4.drop(['id'],axis=1,inplace=True)",No,5,10.0 test_merge4.columns,No,5,71.0 "lr=LogisticRegression() lr.fit(train_merge4,target) lr_pred=lr.predict(test_merge4) accuracy_score(pd.DataFrame(lr.predict(train_merge4)),target)",No,3,7.0 "rf=RandomForestClassifier() 
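# The accuracy_score calls in these cells score the same rows used for fitting, so they are training
# scores rather than generalisation estimates; a held-out split scored with the competition's
# multi-class log loss is more informative. A minimal sketch (illustrative names, kept commented out):
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import log_loss
# X_tr, X_va, y_tr, y_va = train_test_split(train_merge4, target, test_size=0.2, random_state=0)
# rf_holdout = RandomForestClassifier().fit(X_tr, y_tr.values.ravel())
# print(log_loss(y_va.values.ravel(), rf_holdout.predict_proba(X_va)))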
rf.fit(train_merge4,target) rf_pred=rf.predict(test_merge4) accuracy_score(pd.DataFrame(rf.predict(train_merge4)),target)",No,3,71.0 " nb=GaussianNB() nb.fit(train_merge4,target) nb.predict(test_merge4) accuracy_score(pd.DataFrame(nb.predict(train_merge4)),target)",Yes,3,7.0 " dt=tree.DecisionTreeClassifier() dt.fit(train_merge4,target) dt.predict(test_merge4) accuracy_score(pd.DataFrame(dt.predict(train_merge4)),target)",Yes,3,7.0 " svc_ml=svm.SVC() svc_ml.fit(train_merge4,target) svc_ml.predict(test_merge4) accuracy_score(pd.DataFrame(svc_ml.predict(train_merge4)),target)",Yes,3,7.0 " ada=AdaBoostClassifier() ada.fit(train_merge4,target) ada.predict(test_merge4) accuracy_score(pd.DataFrame(ada.predict(train_merge4)),target)",Yes,3,7.0 " knn=KNeighborsClassifier() knn.fit(train_merge4,target) knn.predict(test_merge4) accuracy_score(pd.DataFrame(knn.predict(train_merge4)),target)",Yes,3,7.0 " gb=ensemble.GradientBoostingClassifier() gb.fit(train_merge4,target) gb_pre=gb.predict(test_merge4) accuracy_score(pd.DataFrame(gb.predict(train_merge4)),target)",Yes,3,7.0 "list1=[] tuple_l=() def data_modeling(X,target,model): for i in range(len(model)): ml=model[i] ml.fit(X,target) pred=ml.predict(X) acc_score=accuracy_score(pd.DataFrame(ml.predict(X)),target) tuple_l=(ml.__class__.__name__,acc_score) list1.append(tuple_l) print(tuple_l) return list1 model_score_output=data_modeling(train_merge4,target,[AdaBoostClassifier(),KNeighborsClassifier(), svm.SVC(),RandomForestClassifier(), tree.DecisionTreeClassifier(), GaussianNB(), LogisticRegression(), ensemble.GradientBoostingClassifier()])",Yes,2,7.0 "modelscore_df=pd.DataFrame(model_score_output,columns=['Classifier',""Accuracy score""])'",Yes,5,12.0 modelscore_df,Yes,5,41.0 modelscore_df['classifier code']=np.arange(8),No,5,8.0 "modelscore_df.plot.bar(x='classifier code', y='Accuracy score', rot=0)",No,3,12.0 "predict_test=rf.predict_proba(test_merge4) pred_df=pd.DataFrame(predict_test,columns=['predict_0', 'predict_1', 'predict_2']) submission=pd.concat([test[['id']],pred_df],axis=1) submission.to_csv('sub.csv',index=False,header=True) ",No,4,25.0 "import pandas as pd train = pd.read_csv(""../input/train_users.csv"") test = pd.read_csv(""../input/test_users.csv"") sessions = pd.read_csv(""../input/sessions.csv"")",No,5,45.0 "for data in (train, test): data['year_created'] = data['date_account_created'].apply(lambda x: 'nan' if str(x) == 'nan' else int(str(x)[:4]) ) data['month_created'] = data['date_account_created'].apply(lambda x: 'nan' if str(x) == 'nan' else int(str(x)[5:7])) data['week_created'] = data['date_account_created'].apply(lambda x: 'nan' if str(x) == 'nan' else int(str(x)[8:10])) data['year_first'] = data['date_first_booking'].apply(lambda x: 'nan' if str(x) == 'nan' else int(str(x)[:4])) data['month_first'] = data['date_first_booking'].apply(lambda x: 'nan' if str(x) == 'nan' else int(str(x)[5:7])) data['week_first'] = data['date_first_booking'].apply(lambda x: 'nan' if str(x) == 'nan' else int(str(x)[8:10])) ",No,5,8.0 "for data in (train, test): data.drop(['date_account_created'], axis=1,inplace=True) data.drop(['date_first_booking'], axis=1,inplace=True) train.head()",Yes,5,10.0 "train = pd.merge(train, sessions, how=""left"", left_on=[""id""], right_on=[""user_id""]) test = pd.merge(test, sessions, how=""left"", left_on=[""id""], right_on=[""user_id""])",No,5,32.0 "countries = pd.read_csv(""../input/countries.csv"") countries.head()",Yes,4,45.0 "ga_train['trainrow'] = np.arange(ga_train.shape[0]) ga_test['testrow'] = 
np.arange(ga_test.shape[0])",No,4,8.0 
"# encode phone_brand as integers with LabelEncoder (transform / inverse_transform map between codes and brand names) brand_encoder = LabelEncoder().fit(device_brand['phone_brand']) device_brand['brand'] = brand_encoder.transform(device_brand['phone_brand']) ga_train['brand'] = device_brand['brand'] # aligned on the device_id index ga_test['brand'] = device_brand['brand'] # sparse one-hot: one row per device (trainrow/testrow), one column per brand, a 1 marking each device's brand Xtr_brand = csr_matrix((np.ones(ga_train.shape[0]), (ga_train['trainrow'], ga_train['brand']))) Xte_brand = csr_matrix((np.ones(ga_test.shape[0]), (ga_test['testrow'], ga_test['brand']))) print(Xtr_brand.shape, Xte_brand.shape)",Yes,4,20.0 
"m = device_brand.phone_brand.str.cat(device_brand.device_model) modelencoder = LabelEncoder().fit(m) device_brand['model'] = modelencoder.transform(m) ga_train['model'] = device_brand['model'] ga_test['model'] = device_brand['model'] Xtr_model = csr_matrix((np.ones(ga_train.shape[0]), (ga_train.trainrow, ga_train.model))) Xte_model = csr_matrix((np.ones(ga_test.shape[0]), (ga_test.testrow, ga_test.model))) print('Model features: train shape {}, test shape {}'.format(Xtr_model.shape, Xte_model.shape))",Yes,4,20.0 
"# encode app_id as consecutive integer codes app_encoder = LabelEncoder().fit(app_events['app_id']) app_events['app'] = app_encoder.transform(app_events['app_id'])",No,5,20.0 
"napps = len(app_encoder.classes_) # bag-of-apps: one row per device (trainrow/testrow), one column per app, a 1 where the device has that app d = device_apps.dropna(subset=['trainrow']) # devices not in the train set have NaN trainrow Xtr_app = csr_matrix((np.ones(d.shape[0]), (d['trainrow'], d['app'])), shape=[ga_train.shape[0],napps]) d = device_apps.dropna(subset=['testrow']) Xte_app = csr_matrix((np.ones(d.shape[0]), (d['testrow'], d['app'])), shape=[ga_test.shape[0],napps]) # same number of rows as the brand/model matrices print(Xtr_app.shape, Xte_app.shape)",No,3,17.0 
"# keep only the apps that appear in app_events so app_encoder can encode them app_labels = app_labels.loc[app_labels.app_id.isin(app_events.app_id.unique())] app_labels['app'] = app_encoder.transform(app_labels['app_id']) # encode label_id the same way label_encoder = LabelEncoder().fit(app_labels['label_id']) app_labels['label'] = label_encoder.transform(app_labels['label_id'])",No,4,20.0 
"nlabels = len(label_encoder.classes_) # pass an explicit shape to csr_matrix so train and test get the same number of label columns d = device_labels.dropna(subset=['trainrow']) Xtr_label = csr_matrix((np.ones(d.shape[0]), (d.trainrow, d.label)), shape=(ga_train.shape[0],nlabels)) d = device_labels.dropna(subset=['testrow']) Xte_label = csr_matrix((np.ones(d.shape[0]), (d.testrow, d.label)), shape=(ga_test.shape[0],nlabels)) print(Xtr_label.shape, Xte_label.shape)",No,3,17.0 
"Xtrain = hstack((Xtr_brand, Xtr_model, Xtr_app, Xtr_label), format='csr') Xtest = hstack((Xte_brand, Xte_model, Xte_app, Xte_label), format='csr') print(Xtrain.shape, Xtest.shape)",No,4,11.0 
"target_encoder = LabelEncoder().fit(ga_train['group']) y = target_encoder.transform(ga_train['group']) nclasses = len(target_encoder.classes_) #app_labels",No,5,20.0 
"from sklearn.model_selection import cross_val_score from sklearn.ensemble import RandomForestClassifier rdf = RandomForestClassifier(n_estimators=50, max_depth=None, min_samples_split=2, random_state=0)",No,5,4.0 
"rdf.fit(Xtrain[:70000], y[:70000]) pred = rdf.predict_proba(Xtrain[70001:]) log_loss(y[70001:], pred)",No,3,28.0 
"pred_rdf = rdf.predict(Xtrain[70001:]) np.mean(pred_rdf==y[70001:])",No,4,28.0 
"pred = pd.DataFrame(rdf.predict_proba(Xtrain[70001:]), index=ga_train.iloc[70001:].index, columns=target_encoder.classes_) pred.head()",No,3,12.0 
"predgroup = pd.DataFrame(y[70001:], index=ga_train.iloc[70001:].index) predgroup.head()",No,4,12.0 
"files = [ 'countries', 'age_gender_bkts', 
'test_users', 'train_users', 'sessions' ] data = {} for f in files: data[f] = pd.read_csv('../input/' + f + '.csv')",No,4,45.0 "results.to_csv('dummy_results.csv', index=False)",No,5,25.0 "# Imports # pandas import pandas as pd from pandas import Series,DataFrame # numpy, matplotlib, seaborn import numpy as np import matplotlib.pyplot as plt import seaborn as sns sns.set_style('whitegrid') %matplotlib inline # machine learning from sklearn.linear_model import LogisticRegression from sklearn.svm import SVC, LinearSVC from sklearn.ensemble import RandomForestClassifier from sklearn.neighbors import KNeighborsClassifier from sklearn.naive_bayes import GaussianNB from sklearn import cross_validation import xgboost as xgb ### set-up some custom functions for below def get_year(date): if date == date: return int(str(date)[:4]) return date def get_month(date): if date == date: return int(str(date)[5:7]) return date def language_bucket(dataset): if dataset['language'] == 'en': val = 'en' else: val = 'non-en' return val ",No,3,22.0 "#### import the data train_users = pd.read_csv('../input/train_users.csv') test_users = pd.read_csv('../input/test_users.csv') gender = pd.read_csv('../input/age_gender_bkts.csv') sessions = pd.read_csv('../input/sessions.csv') countries = pd.read_csv('../input/countries.csv') all_users = pd.concat((train_users, test_users), axis=0, ignore_index=True) ",No,4,45.0 "##### the age variable has a few missing values.. let's go ahead and put the average in for these average_age = train_users[""age""].mean() train_users[""age""][np.isnan(train_users[""age""])] = average_age test_users[""age""][np.isnan(test_users[""age""])] = average_age'",No,5,17.0 "#### looking at the country distribution through a couple of variables fig, (axis1, axis2, axis3, axis4, axis5, axis6) = plt.subplots(6,1,figsize=(15,30)) sns.countplot(x='country_destination', data=train_users, palette=""husl"", ax=axis1) sns.countplot(x='signup_flow', hue = ""country_destination"", data=train_users, palette=""husl"", ax=axis2) sns.countplot(x='affiliate_channel', hue = ""country_destination"", data=train_users, palette=""husl"", ax=axis3) sns.countplot(x='age_range', hue = ""country_destination"", data=train_users, palette=""husl"", ax=axis4) sns.countplot(x='signup_year', hue = ""country_destination"", data=train_users, palette=""husl"", ax=axis5) sns.countplot(x='language_bucket', hue = ""country_destination"", data=train_users, palette=""husl"", ax=axis6) '",No,5,33.0 "######## need to change the format of our variables so we can use the algo # signup_method train_users[""signup_method""] = (train_users[""signup_method""] == ""basic"").astype(int) test_users[""signup_method""] = (test_users[""signup_method""] == ""basic"").astype(int) # signup_flow train_users[""signup_flow""] = (train_users[""signup_flow""] == 3).astype(int) test_users[""signup_flow""] = (test_users[""signup_flow""] == 3).astype(int) # language train_users[""language""] = (train_users[""language""] == 'en').astype(int) test_users[""language""] = (test_users[""language""] == 'en').astype(int) # affiliate_channel train_users[""affiliate_channel""] = (train_users[""affiliate_channel""] == 'direct').astype(int) test_users[""affiliate_channel""] = (test_users[""affiliate_channel""] == 'direct').astype(int) # affiliate_provider train_users[""affiliate_provider""] = (train_users[""affiliate_provider""] == 'direct').astype(int) test_users[""affiliate_provider""] = (test_users[""affiliate_provider""] == 'direct').astype(int)'",No,5,16.0 "#### clense 
the data of non-numeric values from sklearn import preprocessing for f in train_users.columns: if f == ""country_destination"" or f == ""id"": continue if train_users[f].dtype == 'object': lbl = preprocessing.LabelEncoder() lbl.fit(np.unique(list(train_users[f].values) + list(test_users[f].values))) train_users[f] = lbl.transform(list(train_users[f].values)) test_users[f] = lbl.transform(list(test_users[f].values)) ##In'",No,5,20.0 "# define training and testing sets X_train = train_users.drop([""country_destination"", ""id"", 'booked', 'age_range'],axis=1) Y_train = train_users[""country_destination""] X_test = test_users.drop(['id', 'age_range'],axis=1).copy() ##In'",No,5,21.0 "# modify country_destination to numerical values country_num_dic = {'NDF': 0, 'US': 1, 'other': 2, 'FR': 3, 'IT': 4, 'GB': 5, 'ES': 6, 'CA': 7, 'DE': 8, 'NL': 9, 'AU': 10, 'PT': 11} num_country_dic = {y:x for x,y in country_num_dic.items()} Y_train = Y_train.map(country_num_dic)",No,5,20.0 "### Xgboost params = {""objective"": ""multi:softmax"", ""num_class"": 12} T_train_xgb = xgb.DMatrix(X_train, Y_train) X_test_xgb = xgb.DMatrix(X_test) gbm = xgb.train(params, T_train_xgb, 20) Y_pred = gbm.predict(X_test_xgb)",No,3,20.0 "# Create submission country_df = pd.DataFrame({ ""id"": test_users[""id""], ""country"": Y_pred }) submission = DataFrame(columns=[""id"", ""country""]) # sort countries according to most probable destination country for key in country_df['country'].value_counts().index: submission = pd.concat([submission, country_df[country_df[""country""] == key]], ignore_index=True) ####submission.to_csv('airbnb.csv', index=False)'",No,5,55.0 "##### add ndf to everyone ndf_only = pd.DataFrame(test_users['id']) ndf_only['country'] = 'NDF' ##submission_final = pd.concat([submission, ndf_only]) ndf_only.to_csv('airbnb.csv', index=False)",No,4,25.0 "###### uh are the previous submissions formatted incorrectly or something? 
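###### note: this competition is scored with NDCG@5, so a submission lists up to five candidate
###### countries per user id in descending order of confidence; the loop below writes exactly five
###### rows per id, leading with NDF for users whose date_first_booking is missing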
######## checking via baseline submission result = [] for index, row in test_users.iterrows(): if isinstance(row['date_first_booking'], float): result.append([row['id'], 'NDF']) result.append([row['id'], 'US']) result.append([row['id'], 'other']) result.append([row['id'], 'FR']) result.append([row['id'], 'IT']) else: result.append([row['id'], 'US']) result.append([row['id'], 'other']) result.append([row['id'], 'FR']) result.append([row['id'], 'IT']) result.append([row['id'], 'GB']) pd.DataFrame(result).to_csv('sub.csv', index = False, header = ['id', 'country'])",No,4,55.0 "##result results = pd.DataFrame(result) results.columns = ['id', 'country'] results[results['id'] == 'qe9gwamyfk']",No,5,55.0 "# Imports # pandas import pandas as pd from pandas import Series,DataFrame # numpy, matplotlib, seaborn, sklearn import numpy as np import matplotlib.pyplot as plt import seaborn as sns from sklearn.ensemble import RandomForestClassifier sns.set_style('whitegrid') ##%matplotlib inline #### import the data train_users = pd.read_csv('../input/train_users_2.csv') test_users = pd.read_csv('../input/test_users.csv') gender = pd.read_csv('../input/age_gender_bkts.csv') sessions = pd.read_csv('../input/sessions.csv') countries = pd.read_csv('../input/countries.csv') ##all_users = pd.concat((train_users, test_users), axis=0, ignore_index=True) mobile_browsers = [] for x in train_users['first_browser'].unique(): if 'Mobile' in x: mobile_browsers.append(x) else: pass major_browsers = ['IE', 'Safari', '-unknown- ', 'Chrome', 'Firefox', 'Mobile'] ### group up those first_browsers train_users['first_browser_grouped'] = np.where(train_users['first_browser'].isin(mobile_browsers), 'Mobile', train_users['first_browser']) train_users['first_browser_grouped'] = np.where(train_users['first_browser_grouped'].isin(major_browsers), train_users['first_browser_grouped'], 'Other') ### find year of account creation #train_users['year_account_creation'] = pd.DatetimeIndex(train_users['date_account_created']).year ### group up the first_device_type dict_first_device_type = {""Mac Desktop"": ""Desktop"", ""Windows Desktop"": ""Desktop"", ""Desktop (Other)"": ""Desktop"", ""iPhone"": ""Phone/Pad"", ""iPad"": ""Phone/Pad"", ""Android Tablet"": ""Phone/Pad"", ""Android Phone"": ""Phone/Pad"", ""SmartPhone (Other)"": ""Phone/Pad""} train_users = train_users.replace({""first_device_type"": dict_first_device_type}) ######### apply the above adjustments to the test dataset test_users['first_browser_grouped'] = np.where(test_users['first_browser'].isin(mobile_browsers), 'Mobile', test_users['first_browser']) test_users['first_browser_grouped'] = np.where(test_users['first_browser_grouped'].isin(major_browsers), test_users['first_browser_grouped'], 'Other') #test_users['year_account_creation'] = pd.DatetimeIndex(test_users['date_account_created']).year test_users = test_users.replace({""first_device_type"": dict_first_device_type}) '",No,3,22.0 "language_distance = {'language' : ['en', 'du', 'fr', 'es'], 'levenshtein_distance_from_en' : [0, 72.61, 92.06, 92.25]} language_distance = pd.DataFrame(language_distance) train_users = pd.merge(train_users, language_distance, on = 'language', how = 'left') test_users = pd.merge(test_users, language_distance, on = 'language', how = 'left') ",No,5,32.0 " ########## fill in the missing values train_users['levenshtein_distance_from_en'].fillna(-1) test_users['levenshtein_distance_from_en'].fillna(-1)",No,5,17.0 "##train_users['year_account_creation'] = 
pd.DatetimeIndex(train_users['date_account_created']).year train_users['timestamp_first_active'] = train_users['timestamp_first_active'].astype(str) train_users['date_account_created'] = pd.to_datetime(train_users['date_account_created']) #### converting the first active day to a date-time var train_users['timestamp_first_active_day'] = train_users['timestamp_first_active'].str[:8] train_users['timestamp_first_active_day'] = pd.to_datetime(train_users['timestamp_first_active_day'], format='%Y%m%d') #### find the first active year train_users['timestamp_first_active_year'] = train_users['timestamp_first_active'].str[:4] train_users['timestamp_first_active_hour'] = train_users['timestamp_first_active'].str[8:10] #### create a var to see if they searched before joining #train_users['searched_before_joining'] = (train_users['timestamp_first_active_day'] < train_users['date_account_created']) #train_users['searched_before_joining'] = train_users['searched_before_joining'] * 1 #### did they do a previous trip? This appears to be a weird variable.. ##train_users['first_trip'] = pd.isnull(train_users['date_first_booking']) * 1 major_languages = ['en'] train_users['language_bucket'] = np.where(train_users['language'].isin(major_languages), 'en', 'other') ##### group up the age variable labels = [1, 2, 3, 4, 5, 6, 7] bins = [0, 20, 30, 40, 50, 60, 9000, 100000] train_users['age'].fillna(10000) train_users['age_group'] = pd.cut(train_users['age'], bins, right=False, labels=labels) train_users['age_group'] = train_users['age_group'] * 1 train_users[""signup_combo""] = train_users[""signup_method""].map(str) + train_users[""signup_flow""].map(str) ##### let's group the affiliate_provider variable major_affiliate_providers = ['direct', 'google', 'bing', 'craigslist', 'facebook'] train_users['affiliate_provider_grp'] = np.where(train_users['affiliate_provider'].isin(major_affiliate_providers), train_users['affiliate_provider'], 'other') train_users[""affiliate_combined""] = train_users[""affiliate_provider_grp""].map(str) + train_users[""affiliate_channel""].map(str) ###### adjust test so it matches the adjustments made to the train dataset test_users['timestamp_first_active'] = test_users['timestamp_first_active'].astype(str) test_users['date_account_created'] = pd.to_datetime(test_users['date_account_created']) test_users['timestamp_first_active_day'] = test_users['timestamp_first_active'].str[:8] test_users['timestamp_first_active_day'] = pd.to_datetime(test_users['timestamp_first_active_day'], format='%Y%m%d') test_users['timestamp_first_active_year'] = test_users['timestamp_first_active'].str[:4] #test_users['searched_before_joining'] = (test_users['timestamp_first_active_day'] < test_users['date_account_created']) #test_users['searched_before_joining'] = test_users['searched_before_joining'] * 1 ##test_users['first_trip'] = pd.isnull(test_users['date_first_booking']) * 1 test_users['language_bucket'] = np.where(test_users['language'].isin(major_languages), 'en', 'other') test_users['age'].fillna(10000) test_users['age_group'] = pd.cut(test_users['age'], bins, right=False, labels=labels) test_users['age_group'] = test_users['age_group'] * 1 test_users['timestamp_first_active_day'] = pd.to_datetime(test_users['timestamp_first_active_day'], format='%Y%m%d') test_users[""signup_combo""] = test_users[""signup_method""].map(str) + test_users[""signup_flow""].map(str) test_users['timestamp_first_active_hour'] = test_users['timestamp_first_active'].str[8:10] test_users['affiliate_provider_grp'] = 
np.where(test_users['affiliate_provider'].isin(major_affiliate_providers), test_users['affiliate_provider'], 'other') test_users[""affiliate_combined""] = test_users[""affiliate_provider_grp""].map(str) + test_users[""affiliate_channel""].map(str) '",No,3,8.0 "#np.any(np.isnan(train_users['id'])) #print(np.all(np.isfinite(col))) #np.isnan(train_users.any()) #np.isfinite(train_users.any()) #np.isnan(test_users.any()) #np.isfinite(test_users.any()) #train_users.head() X_train = train_users_imputed.drop(['signup_app', 'affiliate_provider', 'affiliate_channel', 'levenshtein_distance_from_en', 'month_year_first_active', 'month_year_created', 'year_first_active', 'timestamp_first_active_year', 'country_destination', 'id', 'first_browser', 'age', 'language'], axis=1) y_train = train_users_imputed['country_destination'] X_test = test_users_imputed.drop(['signup_app', 'affiliate_provider', 'affiliate_channel', 'levenshtein_distance_from_en', 'month_year_first_active', 'month_year_created', 'year_first_active', 'timestamp_first_active_year', 'id', 'age', 'first_browser', 'language'], axis = 1) ",No,3,21.0 " ###### look at variable importance in the model importances = clf.feature_importances_ indices = np.argsort(importances)[::-1] # Print the feature ranking print(""Feature ranking:"") for f in indices: print(X_train.columns[f], importances[f]) #print(""%d. feature %d (%f)"" % (f + 1, indices[f], importances[indices[f]])) #for x in range(X_train.shape[1]): # print(X_train.columns[x])",No,5,86.0 "### alright, if none of the entries for an id is NDF, then set the 5th obs == NDF no_ndf_ppl = top_5_records[~top_5_records['id'].isin(top_5_records['id'][top_5_records['variable'] == 'NDF'])] ndf_ppl = top_5_records[top_5_records['id'].isin(top_5_records['id'][top_5_records['variable'] == 'NDF'])] no_ndf_ppl_first = no_ndf_ppl.sort(['value'], ascending=[1]) no_ndf_ppl_first_ndf = no_ndf_ppl_first.groupby('id').head(1) no_ndf_ppl_first_ndf['variable'] = 'NDF' no_ndf_ppl_first_4 = no_ndf_ppl.sort(['value'], ascending=[0]) no_ndf_ppl_first_other = no_ndf_ppl_first_4.groupby('id').head(4) ##### combine all of the dataframes together result = pd.concat([no_ndf_ppl_first_ndf, no_ndf_ppl_first_other , ndf_ppl]) result = result.drop(['value'], axis = 1) result.columns = ['id', 'country'] #### create the final output dataframe final_output_adjusted = DataFrame(columns=['id', 'country']) final_output_adjusted = final_output_adjusted.append(result) #### convert to csv final_output_adjusted.to_csv('adjusted.csv', index = False, header = ['id', 'country']) ",No,2,17.0 "#Importing all dependencies import pandas as pd import numpy as np from sklearn.preprocessing import LabelEncoder from sklearn.cross_validation import train_test_split from sklearn.preprocessing import StandardScaler #algorithms from xgboost.sklearn import XGBClassifier from sklearn.svm import SVC from sklearn.neighbors import KNeighborsClassifier from sklearn.ensemble import RandomForestClassifier from sklearn.naive_bayes import GaussianNB from sklearn.tree import DecisionTreeClassifier ",No,5,22.0 "#Loading datasets data_dir = ""../input/"" train_users = pd.read_csv(data_dir + 'train_users_2.csv',parse_dates=['date_account_created','timestamp_first_active','date_first_booking'], index_col=False) sessions=pd.read_csv(data_dir + 'sessions.csv') test_users=pd.read_csv(data_dir + 'test_users.csv', parse_dates=['date_account_created','timestamp_first_active','date_first_booking'], index_col=False)'",No,5,45.0 "#Printing the columns 
print(train_users.columns) print(test_users.columns)",No,5,71.0 "#Mark test and train users and merge them for data preparation train_users['type']='Train' test_users['country_destination']='NULL' test_users['type']='Test' users = pd.concat([train_users, test_users], ignore_index=True)",No,5,11.0 "#verify the counts print('# of train users: ',train_users.id.count()) print('# of test users: ',test_users.id.count()) print('# of total users: ',train_users.id.count()+test_users.id.count()) print('# of users: ',users.id.count())",No,5,72.0 "def feature_engineering(data): #Date account created data['Day_Acct_Created'] = data['date_account_created'].dt.day data['Month_Acct_Created'] = data['date_account_created'].dt.month data['Year_Acct_Created'] = data['date_account_created'].dt.year data['Hour_Acct_Created'] = data['date_account_created'].dt.hour data['DayOfWeek_Acct_Created'] = data['date_account_created'].dt.dayofweek data['WeekOfYear_Acct_Created'] = data['date_account_created'].dt.weekofyear #Timestamp of first active data['Day_First_Active'] = data['timestamp_first_active'].dt.day data['Month_First_Active'] = data['timestamp_first_active'].dt.month data['Year_First_Active'] = data['timestamp_first_active'].dt.year data['Hour_First_Active'] = data['timestamp_first_active'].dt.hour data['DayOfWeek_First_Active'] = data['timestamp_first_active'].dt.dayofweek data['WeekOfYear_First_Active'] = data['timestamp_first_active'].dt.weekofyear #Date of first booking data['Day_First_Booking'] = data['date_first_booking'].dt.day data['Month_First_Booking'] = data['date_first_booking'].dt.month data['Year_First_Booking'] = data['date_first_booking'].dt.year data['Hour_First_Booking'] = data['date_first_booking'].dt.hour data['DayOfWeek_First_Booking'] = data['date_first_booking'].dt.dayofweek data['WeekOfYear_First_Booking'] = data['date_first_booking'].dt.weekofyear #Replace unknowns by NA data.gender.replace('-unknown-', np.nan, inplace=True) #Replace Ages data.loc[data.age > 95, 'age'] = np.nan data.loc[data.age < 13, 'age'] = np.nan #Converting categorical to numeric enc = LabelEncoder() #data['gender_cd'] = enc.fit_transform(data['gender']) data['signup_method_cd'] = enc.fit_transform(data['signup_method']) data['language_cd'] = enc.fit_transform(data['language']) data['affiliate_channel_cd'] = enc.fit_transform(data['affiliate_channel']) data['affiliate_provider_cd'] = enc.fit_transform(data['affiliate_provider']) #data['first_affiliate_tracked_cd'] = enc.fit_transform(data['first_affiliate_tracked']) data['signup_app_cd'] = enc.fit_transform(data['signup_app']) data['first_device_type_cd'] = enc.fit_transform(data['first_device_type']) data['first_browser_cd'] = enc.fit_transform(data['first_browser']) #Converting the target variable as it is in category category_encoder = LabelEncoder() category_encoder.fit(data['country_destination']) data['country_destination_cd'] = category_encoder.transform(data['country_destination']) #print(category_encoder.classes_) return data",No,4,8.0 temp=feature_engineering(users),No,5,8.0 "#Manual feature engineering #gender #Converting categorial to numeric temp.gender[temp.gender=='nan']='-1' temp.gender[temp.gender=='MALE']='0' temp.gender[temp.gender=='FEMALE']='1' temp.gender[temp.gender=='OTHER']='2' #first_affiliate_tracked temp.first_affiliate_tracked[temp.first_affiliate_tracked=='nan']='-1' temp.first_affiliate_tracked[temp.first_affiliate_tracked=='untracked']='0' temp.first_affiliate_tracked[temp.first_affiliate_tracked=='omg']='1' 
temp.first_affiliate_tracked[temp.first_affiliate_tracked=='linked']='2' temp.first_affiliate_tracked[temp.first_affiliate_tracked=='tracked-other']='3' temp.first_affiliate_tracked[temp.first_affiliate_tracked=='product']='4' temp.first_affiliate_tracked[temp.first_affiliate_tracked=='marketing']='5' temp.first_affiliate_tracked[temp.first_affiliate_tracked=='local ops']='6' temp = temp.fillna(-1)",No,5,20.0 "#Split train and test sets train=temp[temp['type']=='Train'] test=temp[temp['type']=='Test'] print(train.id.count(),test.id.count())",No,5,13.0 "#Creating train_xx and target_xx train_xx=train[[ 'Day_Acct_Created', 'Month_Acct_Created', 'Year_Acct_Created', 'Hour_Acct_Created', 'DayOfWeek_Acct_Created', 'WeekOfYear_Acct_Created', 'Day_First_Active', 'Month_First_Active', 'Year_First_Active', 'Hour_First_Active', 'DayOfWeek_First_Active', 'WeekOfYear_First_Active', 'Day_First_Booking', 'Month_First_Booking', 'Year_First_Booking', 'Hour_First_Booking', 'DayOfWeek_First_Booking', 'WeekOfYear_First_Booking', 'signup_method_cd', 'language_cd', 'affiliate_channel_cd', 'affiliate_provider_cd', 'signup_app_cd', 'first_device_type_cd', 'first_browser_cd','gender','age']] target_xx=train['country_destination_cd'] predict_xx=test[[ 'Day_Acct_Created', 'Month_Acct_Created', 'Year_Acct_Created', 'Hour_Acct_Created', 'DayOfWeek_Acct_Created', 'WeekOfYear_Acct_Created', 'Day_First_Active', 'Month_First_Active', 'Year_First_Active', 'Hour_First_Active', 'DayOfWeek_First_Active', 'WeekOfYear_First_Active', 'Day_First_Booking', 'Month_First_Booking', 'Year_First_Booking', 'Hour_First_Booking', 'DayOfWeek_First_Booking', 'WeekOfYear_First_Booking', 'signup_method_cd', 'language_cd', 'affiliate_channel_cd', 'affiliate_provider_cd', 'signup_app_cd', 'first_device_type_cd', 'first_browser_cd','gender','age']]",No,5,21.0 target_xx.head(),No,5,41.0 "#Splitting train and test X = train_xx y = target_xx X_test = predict_xx #Classifier xgb = RandomForestClassifier() xgb.fit(X, y) y_pred = xgb.predict_proba(X_test) ",Yes,4,48.0 "test['x']=y_pred output=test[['id','x']] ",No,5,55.0 output.head(),No,5,41.0 "%matplotlib inline import numpy as np import pandas as pd import datetime from sklearn.preprocessing import LabelEncoder from xgboost.sklearn import XGBClassifier import matplotlib.pyplot as plt",No,5,23.0 "#Loading data df_train_raw = pd.read_csv('../input/train_users_2.csv') df_test = pd.read_csv('../input/test_users.csv') labels = df_train_raw['country_destination'].values df_train = df_train_raw.drop(['country_destination'], axis=1) id_test = df_test['id'] piv_train = df_train.shape[0]",No,4,45.0 "#Creating a DataFrame with train+test data df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True) #Removing id and date_first_booking df_all = df_all.drop(['id', 'date_first_booking'], axis=1) #Filling nan df_all = df_all.fillna(-1) #####Feature engineering####### #date_account_created df_all.date_account_created = pd.to_datetime(df_all.date_account_created) df_all['dac_year'] = df_all.date_account_created.apply(lambda x: x.year) df_all['dac_month'] = df_all.date_account_created.apply(lambda x: x.month) df_all['dac_day'] = df_all.date_account_created.apply(lambda x: x.day) df_all['dac_weekday'] = df_all.date_account_created.apply(lambda x: x.weekday()) df_all['dac_week'] = df_all.date_account_created.apply(lambda x: x.week) df_all['dac_log_elapsed'] = np.log((datetime.date(2016, 1, 1) - df_all.date_account_created).astype('timedelta64[D]')) df_all = df_all.drop(['date_account_created'], axis=1) 
#timestamp_first_active df_all.timestamp_first_active = pd.to_datetime(df_all.timestamp_first_active, format='%Y%m%d%H%M%S') df_all['tfa_year'] = df_all.timestamp_first_active.apply(lambda x: x.year) df_all['tfa_month'] = df_all.timestamp_first_active.apply(lambda x: x.month) df_all['tfa_day'] = df_all.timestamp_first_active.apply(lambda x: x.day) df_all['tfa_weekday'] = df_all.timestamp_first_active.apply(lambda x: x.weekday()) df_all['tfa_week'] = df_all.timestamp_first_active.apply(lambda x: x.week) df_all['tfa_log_elapsed'] = np.log((datetime.date(2016, 1, 1) - df_all.timestamp_first_active).astype('timedelta64[D]')) df_all = df_all.drop(['timestamp_first_active'], axis=1) #Age av = df_all.age.values df_all['age'] = np.where(np.logical_or(av<14, av>90), -1, av) df_all['age_year'] = np.where(av > 1900, -1, av) #One-hot-encoding features ohe_feats = ['gender', 'signup_method', 'signup_flow', 'language', 'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked', 'signup_app', 'first_device_type', 'first_browser'] for f in ohe_feats: df_all_dummy = pd.get_dummies(df_all[f], prefix=f) df_all = df_all.drop([f], axis=1) df_all = pd.concat((df_all, df_all_dummy), axis=1) ",Yes,4,8.0 "#Splitting train and test vals = df_all.values X_train = vals[:piv_train] le = LabelEncoder() y_train = le.fit_transform(labels) X_test = vals[piv_train:]",No,3,13.0 "np.random.seed(42) samples = np.random.choice(piv_train, 50000) X_train = vals[samples] y_train = le.fit_transform(labels)[samples]",No,5,21.0 "#Python Modules import numpy as np import pandas as pd from sklearn.preprocessing import LabelEncoder import xgboost as xgb import operator",No,5,22.0 "# Loading data df_train = pd.read_csv('../input/train_users_2.csv') df_test = pd.read_csv('../input/test_users.csv') labels = df_train['country_destination'].values df_train = df_train.drop(['country_destination'], axis=1) id_test = df_test['id'] piv_train = df_train.shape[0]",No,3,45.0 "# Creating a DataFrame with train+test data df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True) #Removing id and date_first_booking df_all = df_all.drop(['id', 'date_first_booking'], axis=1) #Filling nan df_all = df_all.fillna(-1)",Yes,4,17.0 "# date_account_created dac = np.vstack( df_all.date_account_created.astype(str).apply( lambda x: list(map(int, x.split('-'))) ).values ) df_all['dac_year'] = dac[:,0] df_all['dac_month'] = dac[:,1] df_all['dac_day'] = dac[:,2] df_all = df_all.drop(['date_account_created'], axis=1)",No,4,11.0 "# timestamp_first_active tfa = np.vstack( df_all.timestamp_first_active.astype(str).apply( lambda x: list(map(int, [x[:4], x[4:6], x[6:8], x[8:10], x[10:12], x[12:14]])) ).values ) df_all['tfa_year'] = tfa[:,0] df_all['tfa_month'] = tfa[:,1] df_all['tfa_day'] = tfa[:,2] df_all = df_all.drop(['timestamp_first_active'], axis=1)",No,3,8.0 "# One-hot-encoding features ohe_feats = ['gender', 'signup_method', 'signup_flow', 'language', 'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked', 'signup_app', 'first_device_type', 'first_browser'] for f in ohe_feats: df_all_dummy = pd.get_dummies(df_all[f], prefix=f) df_all = df_all.drop([f], axis=1) df_all = pd.concat((df_all, df_all_dummy), axis=1)",No,5,20.0 "# Splitting train and test X = df_all.iloc[:piv_train,:] le = LabelEncoder() y = le.fit_transform(labels) X_test = df_all.iloc[piv_train:,:]",No,4,13.0 "# Classifier params = {'eta': 0.1, 'max_depth': 8, 'nround': 100, 'subsample': 0.7, 'colsample_bytree': 0.8, 'seed': 1, 'objective': 'multi:softprob', 
'eval_metric':'ndcg', 'num_class': 12, 'nthread':3} num_boost_round = 10 dtrain = xgb.DMatrix(X, y) clf1 = xgb.train(params=params, dtrain=dtrain, num_boost_round=num_boost_round)",No,5,7.0 "# Get feature scores and store in DataFrame importance = clf1.get_fscore() importance_df = pd.DataFrame( sorted(importance.items(), key=operator.itemgetter(1)), columns=['feature','fscore'] )",No,5,86.0 "# Plot feature importance of top 20 importance_df.iloc[-20:,:].plot(x='feature',y='fscore',kind='barh') # Only select features w/ a feature score (can also specify min fscore) # Retrain model with reduced feature set df_all = df_all[importance_df.feature.values] X = df_all.iloc[:piv_train,:] X_test = df_all.iloc[piv_train:,:] dtrain = xgb.DMatrix(X, y) clf2 = xgb.train(params=params, dtrain=dtrain, num_boost_round=num_boost_round) y_pred = clf2.predict(xgb.DMatrix(X_test)).reshape(df_test.shape[0],12)",Yes,4,48.0 "# Taking the 5 classes with highest probabilities ids = [] #list of ids cts = [] #list of countries for i in range(len(id_test)): idx = id_test[i] ids += [idx] * 5 cts += le.inverse_transform(np.argsort(y_pred[i])[::-1])[:5].tolist() # Generate submission sub = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country']) sub.to_csv('sub.csv',index=False)",Yes,4,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load in import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) import seaborn as sns sns.set() import matplotlib.pyplot as plt # Input data files are available in the ""../input/"" directory. # For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory import os print(os.listdir(""../input"")) # Any results you write to the current directory are saved as output.",No,4,88.0 "data_train = pd.read_csv('../input/train_users_2.csv',parse_dates=['timestamp_first_active','date_account_created','date_first_booking']) data_test = pd.read_csv('../input/test_users.csv',parse_dates=['timestamp_first_active','date_account_created','date_first_booking']) data_train.head()",No,4,45.0 "data_test.head() # Note that 'date_first_booking' is completely missing in test data",No,5,41.0 "# Note incorrect minimum and maximum age values data_train.describe()",No,5,40.0 "print ('Number of lines in the training data are,',data_train.shape[0])",No,5,58.0 "countries = pd.read_csv('../input/countries.csv') countries.head(10)",No,4,45.0 data_all.head(),No,5,41.0 print(data_all.isnull().sum()),No,5,39.0 "# Splitting date time data for date account created data_all['dac_year'] = data_all.date_account_created.dt.year data_all['dac_month'] = data_all.date_account_created.dt.month data_all['dac_day'] = data_all.date_account_created.dt.day # Splitting date time data for time first active data_all['tfa_year'] = data_all.timestamp_first_active.dt.year data_all['tfa_month'] = data_all.timestamp_first_active.dt.month data_all['tfa_day'] = data_all.timestamp_first_active.dt.day data_all.drop('date_account_created',1, inplace=True) data_all.drop('timestamp_first_active',1, inplace=True)",No,4,8.0 data_all.describe(),No,5,40.0 data_all.gender.value_counts(dropna=False).plot(kind='bar'),No,5,33.0 "data_all.dac_year.value_counts(sort=False).plot(kind='bar', title='Number of User Accounts Created in a Year')",No,5,33.0 
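The cells above rank each user's five most probable destination countries because the competition scores submissions with NDCG@5 (the same 'ndcg' named in eval_metric). A minimal sketch of that metric for a single user, assuming one true destination with relevance 1 at its rank and 0 elsewhere (ndcg_at_5 is an illustrative helper, not part of the original notebooks):

import numpy as np

def ndcg_at_5(ranked_countries, true_country):
    # DCG over the top 5 positions: rel_i / log2(i + 1); the ideal DCG is 1, so it is also the NDCG
    for i, country in enumerate(ranked_countries[:5], start=1):
        if country == true_country:
            return 1.0 / np.log2(i + 1)
    return 0.0  # true destination missing from the top 5

# e.g. the true destination 'FR' ranked third scores 1 / log2(4) = 0.5
print(ndcg_at_5(['NDF', 'US', 'FR', 'IT', 'GB'], 'FR'))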
"data_all.tfa_year.value_counts(sort=False).plot(kind='bar', title = 'Number of Users by First Active Year')",No,5,33.0 "data_train.country_destination.value_counts(normalize=True).plot(kind='bar',title='Countries Visited by AirBNB Users')",No,5,33.0 data_all.language.value_counts(sort=True),No,5,72.0 "#Note no null data now left print(data_all.isnull().sum())",No,5,39.0 "b""# Import sklearn.preprocessing.StandardScaler\n#from sklearn.preprocessing import MinMaxScaler\n\n# Initialize a MinMax scaler, then apply it to the numerical features\n#scaler = MinMaxScaler()\n#numerical = ['age','dac_year','dac_month','dac_day','tfa_year','tfa_month','tfa_day']\n#data_all[numerical] = scaler.fit_transform(data_all[numerical])\n\n# Create categorical columns\nfeatures = ['gender','signup_method','signup_flow','language','affiliate_channel','affiliate_provider',\\\n 'first_affiliate_tracked','signup_app','first_device_type','first_browser']\n\n# get dummies\ndata_all = pd.get_dummies(data_all,columns=features)\n""",No,5,20.0 "#Taking the 5 classes with highest probabilities ids = [] #list of ids cts = [] #list of countries for i in range(len(test_ids)): idx = test_ids[i] ids += [idx] * 5 cts += labler.inverse_transform(np.argsort(y_pred[i])[::-1])[:5].tolist() #Generate submission sub = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country']) sub.to_csv('submission.csv',index=False)",No,5,25.0 "import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) import matplotlib.pyplot as plt import seaborn as sns import os for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename))",No,5,88.0 "# Load the data into DataFrames train_users = pd.read_csv('../input/airbnb-recruiting-new-user-bookings/train_users_2.csv') test_users = pd.read_csv('../input/airbnb-recruiting-new-user-bookings/test_users.csv')",No,5,45.0 "print(""Number of users in training set ="", train_users.shape[0] ) print(""Number of users in test set ="",test_users.shape[0])",No,5,58.0 train_users.head(),No,5,41.0 train_users.describe(include = 'all'),No,5,40.0 test_users.head(),No,5,41.0 test_users.describe(include = 'all'),No,5,40.0 "labels = train_users['country_destination'].values train_users = train_users.drop(['country_destination', 'date_first_booking'], axis=1) test_users = test_users.drop(['date_first_booking'], axis=1) id_test = test_users['id'] # Merge train and test users all_users = pd.concat((train_users, test_users), axis=0, ignore_index=True) # Remove ID's since now we are not interested in making predictions all_users.drop('id',axis=1, inplace=True) all_users.head()",No,4,10.0 "from datetime import datetime all_users['date_account_created'] = pd.to_datetime(all_users['date_account_created']) all_users['timestamp_first_active'] = pd.to_datetime((all_users.timestamp_first_active // 1000000), format='%Y%m%d') all_users['date_account_created'] = [datetime.timestamp(d) for d in all_users['date_account_created']] all_users['timestamp_first_active'] = [datetime.timestamp(d) for d in all_users['timestamp_first_active']]",No,5,16.0 all_users.age.describe(),No,5,40.0 "sns.distplot(all_users.age.dropna()) plt.xlabel('Age')",No,5,33.0 "sns.distplot(all_users.age.loc[all_users['age'] < 70].dropna()) plt.xlabel('Age')",No,5,33.0 "all_users['age'] = np.where(all_users['age']<=14, 14, all_users['age']) all_users['age'] = np.where(all_users['age']>=70, 70, all_users['age']) all_users['age'] = 
all_users['age'].fillna(all_users['age'].dropna().values.mean()) all_users['age'].describe()",No,4,8.0 all_users['age'].values.mean(),No,5,40.0 "categorical_features = [ 'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked', 'first_browser', 'first_device_type', 'gender', 'language', 'signup_app', 'signup_method' ] # one-hot-encoding for categorical_feature in categorical_features: all_users_dummies = pd.get_dummies(all_users[categorical_feature], prefix=categorical_feature) all_users = all_users.drop([categorical_feature], axis=1) all_users = pd.concat((all_users, all_users_dummies), axis=1)",No,4,20.0 all_users.head(),No,5,41.0 "from sklearn.preprocessing import LabelEncoder train_users_n = train_users.shape[0] X_train = all_users.values[:train_users_n] le = LabelEncoder() y_train = le.fit_transform(labels) X_test = all_users.values[train_users_n:]",No,5,20.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load in import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) import seaborn as sns import matplotlib.pyplot as plt # Input data files are available in the ""../input/"" directory. # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory %matplotlib inline sns.set_style(""white"") sns.set_context('talk') import os for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename)) # Any results you write to the current directory are saved as output.",No,4,88.0 "# Read The Files train = pd.read_csv('/kaggle/input/airbnb-recruiting-new-user-bookings/train_users_2.csv') age = pd.read_csv('/kaggle/input/airbnb-recruiting-new-user-bookings/age_gender_bkts.csv') countries = pd.read_csv('/kaggle/input/airbnb-recruiting-new-user-bookings/countries.csv') sessions = pd.read_csv('/kaggle/input/airbnb-recruiting-new-user-bookings/sessions.csv') test = pd.read_csv('/kaggle/input/airbnb-recruiting-new-user-bookings/test_users.csv')",No,5,45.0 "print(train.describe()) print(train.info())",No,5,40.0 "print(test.describe()) print(test.info())",No,5,40.0 "#Join the test & train data to fix them both at the same time df = train.append(test, ignore_index = True, sort = True) print(df.info()) ",No,4,11.0 "#Plotting distribution of the data for x in cat: sns.countplot(x=x, data=df,palette='RdBu') plt.ylabel('Number of users') plt.title('Users '+ x + ' Distribution') plt.xticks(rotation='vertical') plt.show() plt.savefig('plot'+str(x)+'.png') ",No,5,33.0 "# Investigate The Time Users Spend Between Being First Active and Actually Making A reservation df['timestamp_first_active'] = pd.to_datetime((df.timestamp_first_active // 1000000), format='%Y%m%d') df['date_first_booking'] = pd.to_datetime(df['date_first_booking']) df['time_to_booking']= df['date_first_booking'] - df['timestamp_first_active'] print(df.time_to_booking.describe())",No,4,16.0 "# Investigate Month and Year Of Users Bookings And Signing up to see most active years/months df['month_booking']= df.date_first_booking.dt.month df['year_booking']= df.date_first_booking.dt.year df['date_account_created'] = pd.to_datetime(df['date_account_created']) df['month_create']=df.date_account_created.dt.month df['year_create']=df.date_account_created.dt.year",No,4,16.0 "for x in 
['month_booking','year_booking','month_create','year_create'] : sns.countplot(x=x,data=df) plt.xticks(rotation='vertical') plt.show() plt.savefig('plot'+str(x)+'.png') ",No,5,33.0 "df.date_account_created.value_counts().plot(kind='line') plt.xlabel('Date') plt.title('New Accounts Created Over Time') plt.xticks(rotation='vertical') plt.show() plt.savefig('plot New Accounts Created Over Time.png') ",No,5,75.0 "new2 = sessions.groupby('user_id').count() print(new2.describe())",No,4,60.0 "#Drop Year Column because it's the same for all entries (2015) age = age.drop('year',axis = 1)",No,5,10.0 "# Group and Plot Age Data g = age.groupby(['age_bucket','gender']).sum().reset_index().sort_values('population_in_thousands') sns.set_context('talk') sns.barplot(x='age_bucket',y = 'population_in_thousands',data=g) plt.xticks(rotation='vertical') plt.title('Different Age Groups') plt.show() plt.savefig('plot Different Age Groups.png') ",No,5,33.0 "# The Age Data #set any value bigger than 130 or lower than 18 to be nan df.age[df.age > 110] = np.nan df.age[df.age < 18] = np.nan #Replace Missing age data with the mean df.loc[df['age'].isnull(),'age'] = df.age.median()",No,5,17.0 "#look at age distribution sns.distplot(df.age) plt.title('Age Distribution Of Users') plt.show()",No,5,33.0 "#Extract the remaining date information df['month_active']= df.timestamp_first_active.dt.month df['year_active']= df.timestamp_first_active.dt.year",No,5,8.0 "#Drop unnecessary columns after the extraction of useful data df1 = df.drop(['date_first_booking','time_to_booking','month_booking','year_booking','date_account_created', 'timestamp_first_active','timestamp_first_active','country_destination','id'],axis=1)",No,5,10.0 "# Handle categorical Columns ndf = pd.get_dummies(df1,columns=['affiliate_channel','affiliate_provider','first_affiliate_tracked', 'first_browser','first_device_type','language','signup_app','year_active' ,'signup_flow','signup_method','month_create','year_create','month_active'], drop_first =True,dtype='float16')",No,5,20.0 ndf.head(),No,5,41.0 "from sklearn.neighbors import KNeighborsClassifier neigh = KNeighborsClassifier(n_neighbors=5) neigh.fit(xtrn1,ytrn1)",No,5,7.0 yprd = neigh.predict(xtst1),No,5,48.0 "yprd1 = pd.DataFrame(yprd) yprd1.index = ytst1.index xtfinal = pd.concat([yprd1,xtst1],axis=1)",Yes,5,11.0 "xtfinal.rename(columns={0:'gender'},inplace = True )",No,5,61.0 "xtrain_final = pd.concat([ytrn1,xtrn1],axis=1)",No,5,11.0 "xfinal = xtrain_final.append(xtfinal) xfinal = pd.concat([xfinal,df.country_destination],axis=1)",No,5,11.0 xfinal.head(),No,5,41.0 "xy = pd.get_dummies(xfinal,columns=['gender'],drop_first =True,dtype='float16') ",No,5,20.0 "from sklearn.preprocessing import StandardScaler from sklearn.utils.class_weight import compute_class_weight from sklearn.ensemble import RandomForestClassifier from sklearn.model_selection import GridSearchCV from sklearn.naive_bayes import BernoulliNB",No,5,22.0 "sc = StandardScaler() x_train = sc.fit_transform(x_train) x_test = sc.transform(x_test)",No,5,18.0 "from xgboost.sklearn import XGBClassifier clf = XGBClassifier(max_depth=6, learning_rate=0.3, n_estimators=25, objective='multi:softprob', subsample=0.5, colsample_bytree=0.5, seed=0) clf.fit(x_train,y_train)",No,5,7.0 y_pred = clf.predict(x_test),No,5,48.0 "submission = pd.DataFrame({'id':test['id'],'country':y_pred}) submission.head() filename = 'Airbnb Predictions 1_1.csv' submission.to_csv(filename,index=False)",No,4,25.0 "df = train.append(test, ignore_index = True, sort = True) 
",No,5,11.0 "df.age[df.age > 110] = np.nan df.age[df.age < 18] = np.nan #Replace Missing age data with the mean df.loc[df['age'].isnull(),'age'] = -1",No,5,17.0 "# Extracting Age Data As before df['timestamp_first_active'] = pd.to_datetime((df.timestamp_first_active // 1000000), format='%Y%m%d') df['day_active'] = df.timestamp_first_active.dt.day df['month_active']= df.timestamp_first_active.dt.month df['year_active']= df.timestamp_first_active.dt.year df['date_account_created'] = pd.to_datetime(df['date_account_created']) df['day_create'] = df.date_account_created.dt.day df['month_create']=df.date_account_created.dt.month df['year_create']=df.date_account_created.dt.year",No,5,8.0 "ndf2 = df.drop(['date_first_booking','date_account_created', 'timestamp_first_active','timestamp_first_active','id'],axis = 1 )",No,5,10.0 "xy2 = pd.get_dummies(ndf2,columns=['affiliate_channel','affiliate_provider','first_affiliate_tracked', 'first_browser','first_device_type','gender','language','signup_app','signup_flow' ,'signup_method'])",No,5,20.0 "from xgboost.sklearn import XGBClassifier clf = XGBClassifier(max_depth=6, learning_rate=0.2, n_estimators=50,class_weight=class_weight , objective='multi:softprob', subsample=0.5, colsample_bytree=0.5, seed=0) clf.fit(x_train,y_train)",No,5,7.0 "submission = pd.DataFrame({'id':test['id'],'country':y_pred}) submission.head() filename = 'Airbnb Predictions 1_2.csv' submission.to_csv(filename,index=False)",No,4,25.0 "train = pd.merge(train, countries, how=""left"", left_on=[""country_destination""], right_on=[""country_destination""])",No,5,32.0 "x_train = train.drop(""country_destination"", axis=1) y_train = train[""country_destination""] x_test = test.copy()",Yes,5,21.0 "import numpy as np from sklearn import preprocessing for f in x_train.columns: if x_train[f].dtype=='object': lbl = preprocessing.LabelEncoder() if f not in x_test.columns: lbl.fit(np.unique(list(x_train[f].values))) x_train[f] = lbl.transform(list(x_train[f].values)) else: lbl.fit(np.unique(list(x_train[f].values) + list(x_test[f].values))) x_train[f] = lbl.transform(list(x_train[f].values)) x_test[f] = lbl.transform(list(x_test[f].values))",Yes,5,20.0 "for col in countries.columns: if col == 'country_destination': continue del(x_train[col])",No,4,10.0 "from sklearn.preprocessing import Imputer imp = Imputer(missing_values='NaN', strategy='mean', axis=0) x_train_nonan = imp.fit_transform(x_train) x_test_nonan = imp.fit_transform(x_test)",Yes,5,17.0 "from sklearn.tree import DecisionTreeClassifier model = DecisionTreeClassifier() model.fit(x_train_nonan, y_train)",Yes,5,7.0 y_test = model.predict(x_test_nonan),No,5,48.0 "submission = pd.DataFrame() submission[""id""] = test[""id""] submission[""country""] = y_test submission.to_csv('airbnb.csv', index=False)'",Yes,5,25.0 "import numpy as np import pandas as pd import matplotlib.pyplot as plt import seaborn as sns from sklearn.metrics import accuracy_score from sklearn.naive_bayes import GaussianNB from sklearn.model_selection import train_test_split from sklearn.preprocessing import LabelEncoder",No,5,22.0 "train = pd.read_csv('/kaggle/input/airbnb-recruiting-new-user-bookings/train_users_2.csv.zip') test = pd.read_csv('/kaggle/input/airbnb-recruiting-new-user-bookings/test_users.csv.zip') age_gender = pd.read_csv('/kaggle/input/airbnb-recruiting-new-user-bookings/age_gender_bkts.csv.zip') sessions = pd.read_csv('/kaggle/input/airbnb-recruiting-new-user-bookings/sessions.csv.zip')",No,5,45.0 "X_train = train.drop(['date_first_booking', 
'country_destination'], axis=1) X_test = test.drop(['date_first_booking'], axis=1)",No,5,10.0 "y_des = train['country_destination'].values X=pd.concat((X_train, X_test), axis=0, ignore_index=True) X.shape",Yes,4,11.0 X.fillna(method='pad').head(),No,4,17.0 "X.loc[X.age > 90, 'age'] = -1 X.loc[X.age < 13, 'age'] = -1 X['age'].describe()",Yes,4,8.0 "X.loc[X.age.isnull(), 'age']=X.age.mean()",No,5,17.0 "dac = np.vstack( X.date_account_created.astype(str).apply( lambda x: list(map(int, x.split('-'))) ).values ) X['dac_year'] = dac[:, 0] X['dac_month'] = dac[:, 1] X['dac_day'] = dac[:, 2] X = X.drop(['date_account_created'], axis=1) X.head()",Yes,4,8.0 "df = sessions.user_id.value_counts() print(df.shape) print(df.head())",Yes,3,72.0 "df = df.to_frame() df = df.rename(columns = {'user_id' : 'session_count'}) df['id'] = df.index df.head()",Yes,4,61.0 "X = pd.merge(X, df, how = 'left', on = ['id']) X.session_count.fillna(-1, inplace = True) X.session_count = X.session_count.astype(int)",Yes,3,32.0 "tfa = np.vstack( X.timestamp_first_active.astype(str).apply( lambda x: list(map(int, [x[:4], x[4:6], x[6:8], x[8:10], x[10:12], x[12:14]])) ).values ) X['tfa_year'] = tfa[:, 0] X['tfa_month'] = tfa[:, 1] X['tfa_day'] = tfa[:, 2] X = X.drop(['timestamp_first_active'], axis=1)",Yes,4,8.0 "# age distributions train['corrected_age']=train['age'].apply(lambda x : 36 if x>90 or x<10 else x) sns.distplot(train.corrected_age.dropna())",Yes,5,33.0 "# percentage of users using different signup_method signup_method = X.signup_method.value_counts(dropna = False) / len(X) * 100 signup_method.plot('bar', rot = 0) plt.xlabel('Sign up method') plt.ylabel('Percentage of signup_method')",Yes,5,33.0 "# percentage of gender gender = X.gender.value_counts(dropna = False) / len(X) * 100 gender.plot('bar', rot = 0) plt.xlabel('gender') plt.ylabel('Percentage of gender')",Yes,5,33.0 "# percentage of people going to different countries des_countries = train.country_destination.value_counts(dropna = False) / len(train) * 100 des_countries.plot('bar', rot = 0) plt.xlabel('Destination country') plt.ylabel('Percentage of booking')",Yes,5,33.0 "# Relavance between Age and destination sns.set_style('ticks') fig, ax = plt.subplots() fig.set_size_inches(10, 7) sns.boxplot(y='age' , x='country_destination',data=train) plt.xlabel('Destination Country box plot',size=15) plt.ylabel('Age of Users', size=15) plt.tick_params(labelsize=12)",No,4,81.0 "# relevance between age and signup method sns.set_style('ticks') fig, ax = plt.subplots() fig.set_size_inches(6, 4) sns.boxplot(y='age' , x='signup_method',data=train) plt.xlabel('Signup method', size=15) plt.ylabel('age', size=15) plt.tick_params(labelsize=12) #sns.despine()",No,4,81.0 "# relevence between age and signup app sns.set_style('ticks') fig, ax = plt.subplots() fig.set_size_inches(6, 4) sns.boxplot(y='age' , x='signup_app',data=train) plt.xlabel('Signup app',size=15) plt.ylabel('Age of Users', size=15) plt.tick_params(labelsize=12) #sns.despine()",No,4,81.0 "#relevence between age and language sns.set_style('ticks') fig, ax = plt.subplots() fig.set_size_inches(8, 5) sns.boxplot(y='age' , x='language',data=train) plt.xlabel('Language', size=15) plt.ylabel('Age of Users', size=15) plt.tick_params(labelsize=12) #sns.despine()",No,4,81.0 "# relevance between age and gender sns.set_style('ticks') fig, ax = plt.subplots() fig.set_size_inches(6, 4) sns.boxplot(y='age' , x='gender',data=train) plt.xlabel('Gender', size=15) plt.ylabel('Age of Users', size=15) plt.tick_params(labelsize=10) 
#sns.despine()",No,4,81.0 "# chart for number of account created train['date_account_created_new'] = pd.to_datetime(train['date_account_created']) sns.set_style('ticks') fig, ax = plt.subplots() fig.set_size_inches(10, 8) train.date_account_created_new.value_counts().plot(kind='line', linewidth=1, color='#1F618D') plt.xlabel('Date ', size=20) plt.ylabel('Number of account created ', size=15) plt.tick_params(labelsize=12) #sns.despine()",No,5,75.0 "oh_features = ['gender', 'signup_method', 'signup_flow', 'language', 'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked', 'signup_app', 'first_device_type', 'first_browser']",No,5,77.0 "for feature in oh_features: X_dummy = pd.get_dummies(X[feature], prefix=feature) X = X.drop([feature], axis=1) X = pd.concat((X, X_dummy), axis=1) X.head()",Yes,3,20.0 "#split the well processed dataset into X_train and X_test X_train = X.iloc[:len(train), :] X_test = X.iloc[len(train):, :] X_train = X_train.drop(['id'], axis=1) X_train.shape X_test = X_test.drop(['id'], axis=1)",Yes,4,13.0 "le = LabelEncoder() y_trans = le.fit_transform(y_des) y_trans.shape",Yes,4,20.0 "dtrain, dtest, train_label, test_label = train_test_split(X_train, y_trans, test_size = 0.3, random_state = 817)",No,5,13.0 "#logistic regression from sklearn.linear_model import LogisticRegression logreg = LogisticRegression() logreg.fit(dtrain, train_label) pred_log=logreg.predict(dtest) from sklearn.metrics import accuracy_score print(accuracy_score(test_label, pred_log))",Yes,3,7.0 "from sklearn.ensemble import RandomForestClassifier rfc = RandomForestClassifier(max_depth=20, n_estimators=100) rfc.fit(dtrain , train_label) pred = rfc.predict(dtest) print(accuracy_score(test_label, pred))",Yes,3,7.0 "fi=pd.Series(rfc.feature_importances_, index=dtrain.columns) fn=fi.sort_values(ascending=True) fn[-20:].plot(kind='barh', color='r', figsize=(25, 12)) plt.xlabel('importance', size=15) plt.title('Random Forest Importance', size=20) plt.tick_params(labelsize=15)",No,5,79.0 "from sklearn.tree import DecisionTreeClassifier dtc = DecisionTreeClassifier(max_depth=10) dtc.fit(dtrain , train_label) pred = dtc.predict(dtest) print(accuracy_score(test_label, pred))",Yes,3,7.0 "from xgboost.sklearn import XGBClassifier xgb = XGBClassifier(max_depth=4, learning_rate=0.03, n_estimators=100, objective='multi:softprob', subsample=0.6, colsample_bytree=0.6, seed=40) xgb.fit(dtrain , train_label) pred = xgb.predict(dtest) print(accuracy_score(test_label, pred))",Yes,3,7.0 "# only XGBoost xgb = XGBClassifier(max_depth=4, learning_rate=0.03, n_estimators=100, objective='multi:softprob', subsample=0.6, colsample_bytree=0.6, seed=40) xgb.fit(X_train, y_trans) XGBC_pred_test = xgb.predict(X_test) XGBC_pred_test_prob=xgb.predict_proba(X_test)",Yes,3,7.0 "ids_test = test['id'] ids = [] countries = [] for i in range(len(X_test)): idx = ids_test[i] ids += [idx] * 5 countries += le.inverse_transform(np.argsort(XGBC_pred_test_prob[i])[::-1][:5]).tolist()",No,5,53.0 "submission = pd.DataFrame({ ""id"" : ids, ""country"" : countries }) submission.to_csv('submission_XGBC.csv', index = False)'",No,5,25.0 "n_labels=len(set(y_des)) n_labels",No,5,77.0 "params = { 'objective': 'multi:softprob', 'eval_metric': 'merror', 'num_class': n_labels, 'eta': 0.5, 'max_depth': 6, 'subsample': 0.5, 'colsample_bytree': 0.3, 'silent': 1, 'seed': 123 }",No,5,59.0 "import xgboost as xgb num_boost_round = 50 Dtrain = xgb.DMatrix(X_train, y_trans) res = xgb.cv(params, Dtrain, num_boost_round=num_boost_round, nfold=5, 
callbacks=[xgb.callback.print_evaluation(show_stdv=True), xgb.callback.early_stop(50)])",No,5,28.0 "num_boost_round = res['test-merror-mean'].idxmin() print(format(num_boost_round)) clf = xgb.train(params, Dtrain, num_boost_round=num_boost_round) clf",Yes,5,7.0 "import operator importance = clf.get_fscore() importance_df = pd.DataFrame( sorted(importance.items(), key=operator.itemgetter(1)), columns=['feature', 'fscore'] )",Yes,5,79.0 "importance_df = importance_df.iloc[-20:, :]",No,5,14.0 "plt.figure() importance_df.plot(kind='barh', x='feature', y='fscore', legend=False, figsize=(20, 10)) plt.title('XGBoost Feature Importance', size=25) plt.xlabel('Relative importance', size=20) plt.ylabel('Features', size=20) plt.tick_params(labelsize=15) #plt.gcf().savefig('feature_importance.png')",No,5,79.0 "#importing neccessary libraries import pandas as pd import numpy as np import matplotlib.pyplot as plt %matplotlib inline import seaborn as sns import datetime from sklearn.preprocessing import LabelEncoder from xgboost.sklearn import XGBClassifier import warnings warnings.filterwarnings('ignore') ",No,5,23.0 "sessions=pd.read_csv('../input/airbnb-recruiting-new-user-bookings/sessions.csv.zip') countries=pd.read_csv('../input/airbnb-recruiting-new-user-bookings/countries.csv.zip') age_gender=pd.read_csv('../input/airbnb-recruiting-new-user-bookings/age_gender_bkts.csv.zip') submission=pd.read_csv('../input/airbnb-recruiting-new-user-bookings/sample_submission_NDF.csv.zip')",No,5,45.0 "test_ids = data_test['id'] Nrows_train = data_train.shape[0] # Store country names labels = data_train['country_destination'].values data_train1 = data_train.drop(['country_destination'], axis=1) # Combining the test and train data. If this is not done, the number of dummy variable columns do not match in test and train data. # Some items present in train data and are not present in test data. For example, browser type. data_all = pd.concat((data_train1, data_test), axis = 0, ignore_index = True) # Dropping ids which are saved separately and date of first booking which is completely absent in the test data data_all = data_all.drop(['id','date_first_booking'], axis=1) ",Yes,3,10.0 "data_all.loc[data_all.age > 100, 'age'] = np.nan data_all.loc[data_all.age < 18, 'age'] = np.nan",No,5,8.0 "data_all.groupby('gender').age.agg(['min','max','mean','count'])",No,5,60.0 data_all.groupby('gender').age.mean().plot(kind='bar'),No,5,33.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv) import matplotlib.pyplot as plt import seaborn as sns from xgboost.sklearn import XGBClassifier from datetime import datetime from sklearn.preprocessing import LabelEncoder from sklearn.model_selection import train_test_split from sklearn.metrics import confusion_matrix, accuracy_score, classification_report # Input data files are available in the read-only ""../input/"" directory # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory import os for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename)) # You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using ""Save & Run All"" # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session",No,5,88.0 "train=pd.read_csv('../input/airbnb-recruiting-new-user-bookings/train_users_2.csv.zip') test = pd.read_csv('../input/airbnb-recruiting-new-user-bookings/test_users.csv.zip') #sessions=pd.read_csv('../input/airbnb-recruiting-new-user-bookings/sessions.csv.zip') #countries=pd.read_csv('../input/airbnb-recruiting-new-user-bookings/countries.csv.zip') #age_gender=pd.read_csv('../input/airbnb-recruiting-new-user-bookings/age_gender_bkts.csv.zip')",No,5,45.0 "b""df.info() # I check info() to spot the columns with missing values and to understand the data types. """,No,5,40.0 train.info() ,No,5,40.0 df['aged'].describe() ,No,5,40.0 "plt.figure(figsize=(30,15)) sns.countplot(x='aged', data=df)",No,5,33.0 "b""df.loc[df['aged']>=85, 'aged']=np.nan # Looking at the plot, I decided not to keep ages of 85 and above, so I set those to NaN as well.""",No,5,8.0 df['aged'].describe(),No,5,40.0 "b""df.isnull().sum() # Apart from the test data, 'first_affiliate_tracked' still shows empty cells.""",No,5,39.0 "b""df['first_active_date']=first_active_date # I split timestamp_first_active into two separate columns, first active date and first active time\ndf['first_active_time']=first_active_time\ndf=df.drop(['timestamp_first_active'], axis=1)""",No,4,8.0 "df.select_dtypes(""object"").columns",No,5,71.0 " le = LabelEncoder() df['signup_method']= le.fit_transform(df['signup_method']) df['language']= le.fit_transform(df['language']) df['affiliate_channel']= le.fit_transform(df['affiliate_channel']) df['affiliate_provider']= le.fit_transform(df['affiliate_provider']) df['signup_app']= le.fit_transform(df['signup_app']) df['first_device_type']= le.fit_transform(df['first_device_type']) df['gender']= le.fit_transform(df['gender']) df['first_browser']= le.fit_transform(df['first_browser']) df['first_affiliate_tracked']= le.fit_transform(df['first_affiliate_tracked'])",No,5,20.0 "df['country_destination'].replace('US',1, inplace=True) df['country_destination'].replace('other',2, inplace=True) df['country_destination'].replace('FR',3, inplace=True) df['country_destination'].replace('CA',4, inplace=True) df['country_destination'].replace('GB',5, inplace=True) df['country_destination'].replace('ES',6, inplace=True) df['country_destination'].replace('IT',7, inplace=True) df['country_destination'].replace('PT',8, inplace=True) df['country_destination'].replace('DE',9, inplace=True) df['country_destination'].replace('NL',10, inplace=True) df['country_destination'].replace('AU',11, inplace=True)",No,5,20.0 "b""dft=df[df['Train']==1] # take only the training data\nX=dft.drop(['country_destination','Train', 'dfb_year', 'dfb_month', 'dfb_day'],axis=1) # define the features
\nY = dft['country_destination'] # define the target""",No,5,21.0 "x_train, x_test, y_train, y_test = train_test_split(X,Y,test_size=0.3, random_state = 42) # split the data",No,5,13.0 "from sklearn.ensemble import RandomForestClassifier rfc = RandomForestClassifier(criterion='entropy', max_depth= 8, max_leaf_nodes=30, min_samples_leaf=30, n_estimators= 100, random_state=0) rfc.fit(x_train, y_train) prediction = pd.DataFrame(data=rfc.predict(x_test), index = x_test.index) classification_metrics(y_test, prediction)",No,3,7.0 "pred_country={ 1:""US"", 2:""other"", 3:""FR"", 4:""CA"", 5:""GB"", 6:""ES"", 7:""IT"", 8:""PT"", 9:""DE"", 10:""NL"", 11:""AU""}",No,5,77.0 "b""dftest=df[df['Train']==0] # prepare for the submission""",No,5,14.0 tested = xgb.predict(testX) # prepare for the submission,No,5,48.0 "results=[] for i in tested: results.append(pred_country[i]) print(results)",No,5,53.0 "my_submission = pd.DataFrame({'id': mysubmission_ID, 'country':results}) my_submission.to_csv('submission.csv', index=False)",No,5,25.0 "# Imports # pandas import pandas as pd from pandas import Series,DataFrame # numpy, matplotlib, seaborn import numpy as np import matplotlib.pyplot as plt import seaborn as sns sns.set_style('whitegrid') %matplotlib inline # machine learning from sklearn.linear_model import LogisticRegression from sklearn.svm import SVC, LinearSVC from sklearn.ensemble import RandomForestClassifier from sklearn.neighbors import KNeighborsClassifier from sklearn.naive_bayes import GaussianNB import xgboost as xgb",No,5,23.0 "# get homesite & test csv files as a DataFrame homesite_df = pd.read_csv(""../input/train.csv"") test_df = pd.read_csv(""../input/test.csv"") # preview the data homesite_df.head()",No,5,45.0 "homesite_df.info() print(""----------------------------"") test_df.info()",No,5,40.0 "# drop unnecessary columns, these columns won't be useful in analysis and prediction homesite_df = homesite_df.drop(['QuoteNumber'], axis=1)",No,5,10.0 "# date # Convert Date to Year, Month, and Week homesite_df['Year'] = homesite_df['Original_Quote_Date'].apply(lambda x: int(str(x)[:4])) homesite_df['Month'] = homesite_df['Original_Quote_Date'].apply(lambda x: int(str(x)[5:7])) homesite_df['Week'] = homesite_df['Original_Quote_Date'].apply(lambda x: int(str(x)[8:10])) test_df['Year'] = test_df['Original_Quote_Date'].apply(lambda x: int(str(x)[:4])) test_df['Month'] = test_df['Original_Quote_Date'].apply(lambda x: int(str(x)[5:7])) test_df['Week'] = test_df['Original_Quote_Date'].apply(lambda x: int(str(x)[8:10])) homesite_df.drop(['Original_Quote_Date'], axis=1,inplace=True) test_df.drop(['Original_Quote_Date'], axis=1,inplace=True)",No,4,8.0 "# customers purchased insurance plan # Plot sns.countplot(x=""QuoteConversion_Flag"", data=homesite_df)",No,5,33.0 "# year # Which year has higher number of customers purchased insurance plan # Plot fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,5)) sns.countplot(x=""QuoteConversion_Flag"",hue=""Year"", data=homesite_df, ax=axis1) sns.countplot(x=homesite_df[""Year""].loc[homesite_df[""QuoteConversion_Flag""] == 1], order=[2013,2014,2015], ax=axis2)",No,5,33.0 "# month # Which month has higher number of customers purchased insurance plan # Plot sns.countplot(x=homesite_df[""Month""].loc[homesite_df[""QuoteConversion_Flag""] == 1], order=[1,2,3,4,5,6,7,8,9,10,11,12])",No,5,33.0 "# fill NaN values homesite_df.fillna(-1, inplace=True) test_df.fillna(-1, inplace=True)",No,5,17.0 "# There are some columns with non-numerical values(i.e.
dtype='object'), # So, We will create a corresponding unique numerical value for each non-numerical value in a column of training and testing set. from sklearn import preprocessing for f in homesite_df.columns: if homesite_df[f].dtype=='object': lbl = preprocessing.LabelEncoder() lbl.fit(np.unique(list(homesite_df[f].values) + list(test_df[f].values))) homesite_df[f] = lbl.transform(list(homesite_df[f].values)) test_df[f] = lbl.transform(list(test_df[f].values))",No,5,20.0 "# define training and testing sets X_train = homesite_df.drop(""QuoteConversion_Flag"",axis=1) Y_train = homesite_df[""QuoteConversion_Flag""] X_test = test_df.drop(""QuoteNumber"",axis=1).copy()",No,5,21.0 "# Create submission submission = pd.DataFrame() submission[""QuoteNumber""] = test_df[""QuoteNumber""] submission[""QuoteConversion_Flag""] = Y_pred submission.to_csv('homesite.csv', index=False)'",No,4,25.0 "import pandas as pd import xgboost as xgb import numpy as np from sklearn import preprocessing df = pd.read_csv(""../input/train.csv"") test = pd.read_csv(""../input/test.csv"") df['Date']=pd.to_datetime(pd.Series(df['Original_Quote_Date'])) df['Year']=df['Date'].apply(lambda x: int(str(x)[:4])) df['Month']=df['Date'].apply(lambda x: int(str(x)[5:7])) df['Date']=df['Date'].apply(lambda x: int(str(x)[8:10])) df['Field10'].apply(lambda x : int(x.replace(',','')) ) test['Date']=pd.to_datetime(pd.Series(test['Original_Quote_Date'])) test['Year']=test['Date'].apply(lambda x: int(str(x)[:4])) test['Month']=test['Date'].apply(lambda x: int(str(x)[5:7])) test['Date']=test['Date'].apply(lambda x: int(str(x)[8:10])) test['Field10'].apply(lambda x : int(x.replace(',',''))) label=df['QuoteConversion_Flag'] df.drop('QuoteConversion_Flag',axis=1,inplace=True) number=test['QuoteNumber'] drop_columns=['Original_Quote_Date','QuoteNumber'] for names in drop_columns: df.drop(names,axis=1,inplace=True) test.drop(names,axis=1,inplace=True) clf=xgb.XGBClassifier(max_depth=7,learning_rate=0.03,n_estimators=650,subsample=0.86,seed=50)'",No,3,8.0 "for f in df.columns: if df[f].dtypes=='object': encoder=preprocessing.LabelEncoder() encoder.fit( list(df[f])+list(test[f]) ) df[f]=encoder.transform(list(df[f].values)) test[f]=encoder.transform(list(test[f].values)) df.fillna(-1,inplace=True) test.fillna(-1,inplace=True)",No,4,20.0 "clf.fit(df,label) output=clf.predict_proba(test)[:,1]",No,4,48.0 "sample=pd.read_csv('../input/sample_submission.csv') sample.QuoteConversion_Flag=output sample.to_csv('final.csv',index=False)",No,4,25.0 "# Imports # pandas import pandas as pd from pandas import Series,DataFrame # numpy, matplotlib, seaborn import numpy as np import matplotlib.pyplot as plt import seaborn as sns sns.set_style('whitegrid') %matplotlib inline # machine learning from sklearn.linear_model import LogisticRegression from sklearn.svm import SVC, LinearSVC from sklearn.ensemble import RandomForestClassifier from sklearn.neighbors import KNeighborsClassifier from sklearn.naive_bayes import GaussianNB from sklearn.preprocessing import maxabs_scale import xgboost as xgb ",No,5,23.0 "# fill NaN values homesite_df.fillna(homesite_df.mean(), inplace=True) test_df.fillna(test_df.mean(), inplace=True) # define training and testing sets X_train = homesite_df.drop(""QuoteConversion_Flag"",axis=1) Y_train = homesite_df[""QuoteConversion_Flag""] X_test = test_df.drop(""QuoteNumber"",axis=1).copy() X_train = maxabs_scale(X_train) X_test = maxabs_scale(X_test)",No,4,21.0 "import pandas as pd from collections import Counter %pylab inline ",No,5,23.0 
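The Homesite cells above submit predict_proba(...)[:, 1] because that competition is scored by ROC AUC. A minimal sketch of checking that score on a held-out split, assuming X_train and Y_train have already been built as in the ""# define training and testing sets"" cell above (the smaller n_estimators is only to keep the sketch fast):

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import xgboost as xgb

# hold out 20% of the labelled quotes for validation
X_tr, X_val, y_tr, y_val = train_test_split(X_train, Y_train, test_size=0.2, random_state=0)

model = xgb.XGBClassifier(max_depth=7, learning_rate=0.03, n_estimators=100, subsample=0.86)
model.fit(X_tr, y_tr)

# AUC is computed on the positive-class probability, the same quantity written to the submission file
val_probs = model.predict_proba(X_val)[:, 1]
print('validation AUC:', roc_auc_score(y_val, val_probs))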
"trainPath = r'../input/train.csv' testPath = r'../input/test.csv'",No,5,77.0 "train_df = pd.read_csv(trainPath) test_df = pd.read_csv(testPath)",No,5,45.0 "#deal character data train_df['Year'] = train_df['Original_Quote_Date'].apply(lambda x: int(str(x)[:4])) train_df['Month'] = train_df['Original_Quote_Date'].apply(lambda x: int(str(x)[5:7])) train_df['Week'] = train_df['Original_Quote_Date'].apply(lambda x: int(str(x)[8:10])) test_df['Year'] = test_df['Original_Quote_Date'].apply(lambda x: int(str(x)[:4])) test_df['Month'] = test_df['Original_Quote_Date'].apply(lambda x: int(str(x)[5:7])) test_df['Week'] = test_df['Original_Quote_Date'].apply(lambda x: int(str(x)[8:10])) train_df.drop(['Original_Quote_Date'], axis=1,inplace=True) test_df.drop(['Original_Quote_Date'], axis=1,inplace=True)",No,4,8.0 "#get character data columns notNumTypeCol = [col for col in train_df.columns if train_df[col].dtype == dtype('O')]",No,5,77.0 "train_x_df = train_df.drop(""QuoteConversion_Flag"",axis=1) train_y_df = train_df[""QuoteConversion_Flag""]",No,5,21.0 "train = pd.read_csv(""../input/train.csv"") test = pd.read_csv(""../input/test.csv"")",No,5,45.0 "train = train.drop('QuoteNumber', axis=1) test = test.drop('QuoteNumber', axis=1) # Lets play with some dates train['Date'] = pd.to_datetime(pd.Series(train['Original_Quote_Date'])) train = train.drop('Original_Quote_Date', axis=1) test['Date'] = pd.to_datetime(pd.Series(test['Original_Quote_Date'])) test = test.drop('Original_Quote_Date', axis=1) train['Year'] = train['Date'].apply(lambda x: int(str(x)[:4])) train['Month'] = train['Date'].apply(lambda x: int(str(x)[5:7])) train['weekday'] = train['Date'].dt.dayofweek test['Year'] = test['Date'].apply(lambda x: int(str(x)[:4])) test['Month'] = test['Date'].apply(lambda x: int(str(x)[5:7])) test['weekday'] = test['Date'].dt.dayofweek train = train.drop('Date', axis=1) test = test.drop('Date', axis=1)",No,4,8.0 "from sklearn.model_selection import StratifiedKFold clf = GridSearchCV(xgb_model, parameters, cv=StratifiedKFold(n_splits=5, shuffle=True), scoring='roc_auc', verbose=2, refit=True) clf.fit(train[features], train[""QuoteConversion_Flag""])'",No,5,6.0 "test_probs = clf.predict_proba(test[features])[:,1]",No,5,48.0 "import os import sys import random import warnings import numpy as np import pandas as pd import matplotlib.pyplot as plt from tqdm import tqdm from keras.models import Model, load_model from keras.layers import Input from keras.layers.core import Dropout, Lambda from keras.layers.convolutional import Conv2D, Conv2DTranspose from keras.layers.pooling import MaxPooling2D from keras.layers.merge import concatenate from keras.callbacks import EarlyStopping, ModelCheckpoint from keras import backend as K from keras.optimizers import Adam import tensorflow as tf from skimage.io import imread from skimage.transform import resize",No,5,22.0 "X_train = np.zeros((tot_num, IMG_HEIGHT, IMG_WIDTH), dtype=np.float32) Y_train = np.zeros((tot_num, IMG_HEIGHT, IMG_WIDTH), dtype=np.float32)",No,5,21.0 "X_train_one = np.array(X_train_one) Y_train_one = np.array(Y_train_one) X_train_zero = np.array(X_train_zero) Y_train_zero = np.array(Y_train_zero)",No,5,21.0 "X_train = [] Y_train = []",No,5,77.0 "X_train = np.array(X_train) Y_train = np.array(Y_train)",No,5,21.0 IMG_CHANNELS = 1,No,5,77.0 "!pip3 install git+https://github.com/qubvel/segmentation_models from segmentation_models import Unet # model = Unet('densenet121',encorder_weights='imagenet',freeze_encorder=True)",No,5,87.0 "results = 
model.fit(X_train_ax, Y_train_ax, validation_split=0.1, batch_size=8, epochs=18)",No,5,7.0 execute_data = False,No,5,77.0 execute_metric = False,No,5,77.0 "######################################################################################################################## # ====================================================================================================================== # u_model_blocks # ====================================================================================================================== ######################################################################################################################## # needed for u_model # standard-module imports from keras.layers import add, concatenate, Conv2D, MaxPooling2D from keras.layers import BatchNormalization, Lambda from keras.layers.advanced_activations import ELU, LeakyReLU # ====================================================================================================================== # utility blocks needed for internal performance # ====================================================================================================================== def NConv2D(filters, kernel_size, strides=(1, 1), padding='valid', dilation_rate=1, activation=None, kernel_initializer='glorot_uniform'): """"""Create a (Normalized Conv2D followed by a chosen activation) function Conv2D -> BatchNormalization -> activation() :param filters: Integer, the dimensionality of the output space (i.e. the number of output filters in the convolution) :param kernel_size: An integer or tuple/list of 2 integers, specifying the height and width of the 2D convolution window. Can be a single integer to specify the same value for all spatial dimensions. :param strides: An integer or tuple/list of 2 integers, specifying the strides of the convolution along the height and width. Can be a single integer to specify the same value for all spatial dimensions. Specifying any stride value != 1 is incompatible with specifying any dilation_rate value != 1. :param padding: one of 'valid' or 'same' (case-insensitive), 'valid' by default to have the same as Conv2D :param dilation_rate: an integer or tuple/list of a single integer, specifying the dilation rate to use for dilated convolution. 
Currently, specifying any dilation_rate value != 1 is incompatible with specifying any strides value != 1 :param activation: string, one of 'elu' or 'relu' or None (case-sensitive), specifies activation function to be performed after BatchNormalization :param kernel_initializer: Initializer for the kernel weights matrix (see initializers in keras documentation) :return: a function, combined of 2D Convolution, followed by BatchNormalization across filters, and specified activation in that order """""" assert activation in ['relu', 'elu', None] # actv is a function, not a string, like activation actv = activation == 'relu' and (lambda: LeakyReLU(0.0)) or activation == 'elu' and (lambda: ELU(1.0)) or None def f(_input): conv = Conv2D(filters=filters, kernel_size=kernel_size, strides=strides, padding=padding, dilation_rate=dilation_rate, kernel_initializer=kernel_initializer)(_input) norm = BatchNormalization(axis=3)(conv) return actv()(norm) return f # needed for rblock (residual block) def _shortcut(_input, residual): stride_width = _input._keras_shape[1] / residual._keras_shape[1] stride_height = _input._keras_shape[2] / residual._keras_shape[2] equal_channels = residual._keras_shape[3] == _input._keras_shape[3] shortcut = _input # 1 X 1 conv if shape is different. Else identity. if stride_width > 1 or stride_height > 1 or not equal_channels: shortcut = Conv2D(filters=residual._keras_shape[3], kernel_size=(1, 1), strides=(stride_width, stride_height), kernel_initializer=""he_normal"", padding=""valid"")(_input) return add([shortcut, residual]) def rblock(inputs, filters, kernel_size, padding='valid', activation=None, scale=0.1): """"""Create a scaled Residual block connecting the down-path and the up-path of the u-net architecture Activations are scaled by a constant to prevent the network from dying. Usually is set between 0.1 and 0.3. See: https://towardsdatascience.com/a-simple-guide-to-the-versions-of-the-inception-network-7fc52b863202 :param inputs: Input 4D tensor (samples, rows, cols, channels) :param filters: Integer, the dimensionality of the output space (i.e. the number of output convolution filters) :param kernel_size: An integer or tuple/list of 2 integers, specifying the height and width of the 2D convolution window. Can be a single integer to specify the same value for all spatial dimensions. 
:param padding: one of 'valid' or 'same' (case-insensitive), 'valid' by default to have the same as Conv2D :param activation: string, one of 'elu' or 'relu' or None (case-sensitive), specifies activation function to use everywhere in the block :param scale: scaling factor preventing the network from dying out :return: 4D tensor (samples, rows, cols, channels) output of a residual block, given inputs """""" assert activation in ['relu', 'elu', None] # actv is a function, not a string, like activation actv = activation == 'relu' and (lambda: LeakyReLU(0.0)) or activation == 'elu' and (lambda: ELU(1.0)) or None residual = Conv2D(filters=filters, kernel_size=kernel_size, padding=padding)(inputs) residual = BatchNormalization(axis=3)(residual) residual = Lambda(lambda x: x * scale)(residual) res = _shortcut(inputs, residual) return actv()(res) # ====================================================================================================================== # information blocks # ====================================================================================================================== def convolution_block(inputs, filters, kernel_size=(3, 3), padding='valid', activation=None, version='normalized', pars={}, allowed_pars={}): """"""Create a version of a convolution block. Versions: with and without batch-normalization after convolutions. :param inputs: Input 4D tensor (samples, rows, cols, channels) :param filters: Integer, the dimensionality of the output space (i.e. the number of output filters in the convolution). :param kernel_size: An integer or tuple/list of 2 integers, specifying the height and width of the 2D convolution window. Can be a single integer to specify the same value for all spatial dimensions. :param padding: one of 'valid' or 'same' (case-insensitive), 'valid' by default to have the same as Conv2D :param activation: string, specifies activation function to use everywhere in the block :param version: version of the convolution block, one of 'not_normalized', 'normalized' (case sensitive) :param pars: dictionary of parameters passed to u-net, determines the version, if this type of block is chosen :param allowed_pars: dictionary of all allowed to be passed to u-net parameters :return: 4D tensor (samples, rows, cols, channels) output of a convolution block, given inputs """""" assert activation in ['relu', 'elu', None] # checking that the allowed version names did not change in ALLOWED_PARS if allowed_pars != {}: assert allowed_pars.get('information_block').get('convolution').get('simple') == ['not_normalized', 'normalized'] # keep version argument if need to use without PARS assert version in ['not_normalized', 'normalized'] # setting the version from pars if pars.get('information_block').get('convolution').get('simple') is not None: version = pars.get('information_block').get('convolution').get('simple') if version == 'normalized': conv1 = NConv2D(filters=filters, kernel_size=kernel_size, activation=activation, padding=padding)(inputs) return NConv2D(filters=filters, kernel_size=kernel_size, activation=activation, padding=padding)(conv1) else: conv1 = Conv2D(filters=filters, kernel_size=kernel_size, activation=activation, padding=padding)(inputs) return Conv2D(filters=filters, kernel_size=kernel_size, activation=activation, padding=padding)(conv1) def dilated_convolution_block(inputs, filters, kernel_size=(3, 3), padding='valid', activation=None, version='normalized', pars={}, allowed_pars={}): """"""Create a version of a dilated-convolution block. 
Versions: with and without batch-normalization after dilated convolutions. See more about dilated convolutions: https://towardsdatascience.com/review-dilated-convolution-semantic-segmentation-9d5a5bd768f5 :param inputs: Input 4D tensor (samples, rows, cols, channels) :param filters: Integer, the dimensionality of the output space (i.e. the number of output filters in the convolution). :param kernel_size: An integer or tuple/list of 2 integers, specifying the height and width of the 2D convolution window. Can be a single integer to specify the same value for all spatial dimensions. :param padding: one of 'valid' or 'same' (case-insensitive), 'valid' by default to have the same as Conv2D :param activation: string, specifies activation function to use everywhere in the block :param version: version of the dilated-convolution block, one of 'not_normalized', 'normalized' (case sensitive) :param pars: dictionary of parameters passed to u-net, determines the version, if this type of block is chosen :param allowed_pars: dictionary of all allowed to be passed to u-net parameters :return: 4D tensor (samples, rows, cols, channels) output of a dilated-convolution block, given inputs """""" assert activation in ['relu', 'elu', None] # checking that the allowed version names did not change in ALLOWED_PARS if allowed_pars != {}: assert allowed_pars.get('information_block').get('convolution').get('dilated') == ['not_normalized', 'normalized'] # keep version argument if need to use without PARS assert version in ['not_normalized', 'normalized'] # setting the version from pars if pars.get('information_block').get('convolution').get('dilated') is not None: version = pars.get('information_block').get('convolution').get('dilated') if version == 'normalized': conv1 = NConv2D(filters=filters, kernel_size=kernel_size, padding=padding, dilation_rate=2, activation=activation)(inputs) return NConv2D(filters=filters, kernel_size=kernel_size, padding=padding, dilation_rate=1, activation=activation)(conv1) else: conv1 = Conv2D(filters=filters, kernel_size=kernel_size, padding=padding, dilation_rate=2, activation=activation)(inputs) return Conv2D(filters=filters, kernel_size=kernel_size, padding=padding, dilation_rate=1, activation=activation)(conv1) def inception_block_v1(inputs, filters, activation=None, version='b', pars={}, allowed_pars={}): """"""Create a version of v1 inception block described in: https://towardsdatascience.com/a-simple-guide-to-the-versions-of-the-inception-network-7fc52b863202 Create an inception block described in v1, sections 'a' (for naive version), or 'b' (with dimension reduction) Each version has 4 verticals in their structure. See the link above. For all versions, verticals 1 and 2 of the block start with 2D convolution, which: reduces the number of input filters to next convolutions (to make computation cheaper) uses (1, 1) kernels, no Normalization is NOT normalized is followed by specified activation For all versions, verticals 1, 2, 3: the final convolution layer is not normalised and not activated since it will be done after concatenation Vertical 4 is just a Conv2D. It gets normalized and activated after being concatenated with outputs of other verticals. The concatenated output of the verticals is normalised and then activated with a given activation :param inputs: Input 4D tensor (samples, rows, cols, channels) :param filters: Integer, the dimensionality of the output space (i.e. the number of output filters in the convolution).
:param activation: string, specifies activation function to use everywhere in the block :param version: version of inception block, one of 'a', 'b' (case sensitive) :param pars: dictionary of parameters passed to u-net, determines the version, if this type of block is chosen :param allowed_pars: dictionary of all allowed to be passed to u-net parameters :return: 4D tensor (samples, rows, cols, channels) output of an inception block, given inputs """""" assert filters % 16 == 0 # checking that the allowed version names did not change in ALLOWED_PARS if allowed_pars != {}: assert allowed_pars.get('information_block').get('inception').get('v1') == ['a', 'b'] # keep version argument if need to use without PARS assert version in ['a', 'b'] # setting the version from pars if pars.get('information_block').get('inception').get('v1') is not None: version = pars.get('information_block').get('inception').get('v1') assert activation in ['relu', 'elu', None] # actv is a function, not a string, like activation actv = activation == 'relu' and (lambda: LeakyReLU(0.0)) or activation == 'elu' and (lambda: ELU(1.0)) or None # vertical 1 if version == 'a': c1 = Conv2D(filters=filters // 8, kernel_size=(5, 5), padding='same', kernel_initializer='he_normal')(inputs) else: c1_1 = Conv2D(filters=filters // 16, kernel_size=(1, 1), padding='same', activation=activation, kernel_initializer='he_normal')(inputs) c1 = Conv2D(filters=filters // 8, kernel_size=(5, 5), padding='same', kernel_initializer='he_normal')(c1_1) # vertical 2 if version == 'a': c2 = Conv2D(filters=filters // 2, kernel_size=(3, 3), padding='same', kernel_initializer='he_normal')(inputs) else: c2_1 = Conv2D(filters=filters // 8 * 3, kernel_size=(1, 1), padding='same', activation=activation, kernel_initializer='he_normal')(inputs) c2 = Conv2D(filters=filters // 2, kernel_size=(3, 3), padding='same', kernel_initializer='he_normal')(c2_1) # vertical 3 p3_1 = MaxPooling2D(pool_size=(3, 3), strides=(1, 1), padding='same')(inputs) if version == 'b': c3 = Conv2D(filters=filters // 8, kernel_size=(1, 1), padding='same', kernel_initializer='he_normal')(p3_1) else: c3 = p3_1 # vertical 4 c4_1 = Conv2D(filters=filters // 4, kernel_size=(1, 1), padding='same', kernel_initializer='he_normal')(inputs) c4 = c4_1 # concatenating verticals together, normalizing and applying activation result = concatenate([c1, c2, c3, c4], axis=3) result = BatchNormalization(axis=3)(result) result = actv()(result) return result def inception_block_v2(inputs, filters, activation=None, version='b', pars={}, allowed_pars={}): """"""Create a version of v2 inception block described in: https://towardsdatascience.com/a-simple-guide-to-the-versions-of-the-inception-network-7fc52b863202 Create an inception block described in v2, sections 'a', 'b', or 'c' Each version has 4 verticals in their structure. See the link above. For all versions, verticals 1 and 2 of the block start with 2D convolution, which: reduces the number of input filters to next convolutions (to make computation cheaper) uses (1, 1) kernels, no Normalization is NOT normalized is followed by specified activation For all versions, verticals 1, 2, 3: the middle convolutions use NConv2D with given activation, see its docstring the final convolution layer is not normalised and not activated since it will be done after concatenation Vertical 4 is just a Conv2D. It gets normalized and activated after being concatenated with outputs of other verticals.
The concatenated output of the verticals is normalised and then activated with a given activation :param inputs: Input 4D tensor (samples, rows, cols, channels) :param filters: Integer, the dimensionality of the output space (i.e. the number of output filters in the convolution). :param activation: string, specifies activation function to use everywhere in the block :param version: version of inception block, one of 'a', 'b', 'c' (case sensitive) :param pars: dictionary of parameters passed to u-net, determines the version, if this type of block is chosen :param allowed_pars: dictionary of all allowed to be passed to u-net parameters :return: 4D tensor (samples, rows, cols, channels) output of an inception block, given inputs """""" assert filters % 16 == 0 # checking that the allowed version names did not change in ALLOWED_PARS if allowed_pars != {}: assert allowed_pars.get('information_block').get('inception').get('v2') == ['a', 'b', 'c'] # keep version argument if need to use without PARS assert version in ['a', 'b', 'c'] # setting the version from pars if pars.get('information_block').get('inception').get('v2') is not None: version = pars.get('information_block').get('inception').get('v2') assert activation in ['relu', 'elu', None] # actv is a function, not a string, like activation actv = activation == 'relu' and (lambda: LeakyReLU(0.0)) or activation == 'elu' and (lambda: ELU(1.0)) or None # vertical 1 c1_1 = Conv2D(filters=filters // 16, kernel_size=(1, 1), padding='same', activation=activation, kernel_initializer='he_normal')(inputs) if version == 'a': c1_2 = NConv2D(filters=filters // 8, kernel_size=3, padding='same', activation=activation, kernel_initializer='he_normal')(c1_1) c1 = Conv2D(filters=filters // 8, kernel_size=3, padding='same', kernel_initializer='he_normal')(c1_2) elif version == 'b': c1_2 = NConv2D(filters=filters // 8, kernel_size=(1, 3), padding='same', activation=activation, kernel_initializer='he_normal')(c1_1) c1_3 = NConv2D(filters=filters // 8, kernel_size=(3, 1), padding='same', activation=activation, kernel_initializer='he_normal')(c1_2) c1_4 = NConv2D(filters=filters // 8, kernel_size=(1, 3), padding='same', activation=activation, kernel_initializer='he_normal')(c1_3) c1 = Conv2D(filters=filters // 8, kernel_size=(3, 1), padding='same', kernel_initializer='he_normal')(c1_4) else: c1_2 = NConv2D(filters=filters // 8, kernel_size=(1, 3), padding='same', activation=activation, kernel_initializer='he_normal')(c1_1) c1_3 = NConv2D(filters=filters // 8, kernel_size=3, padding='same', activation=activation, kernel_initializer='he_normal')(c1_2) c1_41 = Conv2D(filters=filters // 8, kernel_size=(1, 3), padding='same', kernel_initializer='he_normal')(c1_3) c1_42 = Conv2D(filters=filters // 8, kernel_size=(3, 1), padding='same', kernel_initializer='he_normal')(c1_3) c1 = concatenate([c1_41, c1_42], axis=3) # vertical 2 c2_1 = Conv2D(filters=filters // 8 * 3, kernel_size=(1, 1), padding='same', activation=activation, kernel_initializer='he_normal')(inputs) if version == 'a': c2 = Conv2D(filters=filters // 2, kernel_size=(3, 3), padding='same', kernel_initializer='he_normal')(c2_1) elif version == 'b': c2_2 = NConv2D(filters=filters // 2, kernel_size=(1, 3), padding='same', activation=activation, kernel_initializer='he_normal')(c2_1) c2 = Conv2D(filters=filters // 2, kernel_size=(3, 1), padding='same', kernel_initializer='he_normal')(c2_2) else: c2_21 = Conv2D(filters=filters // 2, kernel_size=(1, 3), padding='same', kernel_initializer='he_normal')(c2_1) c2_22 = 
Conv2D(filters=filters // 2, kernel_size=(3, 1), padding='same', kernel_initializer='he_normal')(c2_1) c2 = concatenate([c2_21, c2_22], axis=3) # vertical 3 p3_1 = MaxPooling2D(pool_size=(3, 3), strides=(1, 1), padding='same')(inputs) c3 = Conv2D(filters=filters // 8, kernel_size=(1, 1), padding='same', kernel_initializer='he_normal')(p3_1) # vertical 4 c4 = Conv2D(filters=filters // 4, kernel_size=(1, 1), padding='same', kernel_initializer='he_normal')(inputs) # concatenating verticals together, normalizing and applying activation result = concatenate([c1, c2, c3, c4], axis=3) result = BatchNormalization(axis=3)(result) result = actv()(result) return result def inception_block_et(inputs, filters, activation='relu', version='b', pars={}, allowed_pars={}): """"""Create an inception block with 2 options. For intuition read, parts v1 and v2: https://towardsdatascience.com/a-simple-guide-to-the-versions-of-the-inception-network-7fc52b863202 Each version/option has 4 verticals in their structure. See the link above. Default option: version='b' Create an inception block close to one described in v2, but keeps 5 as a factor for some convolutions Alternative option: version='a' Create an inception block described in v1, section Function author Edward Tyantov. That's why the name: inception_block_et. My modifications use version='a' instead of split=False use version='b' instead of split=True change default to version='b', aka split=True swap: Conv2D -> BatchNormalization -> activation to: NConv2D blocks. See NConv2D documentation for them. swap: Conv2D -> activation to: Conv2D -> Conv2D(activation=activation) change the order of the verticals to coincide with v2_paper notation change names of the outputs of the block verticals to c1, c2, c3, c4 use 'result' instead of 'res' to avoid confusion with residuals :param inputs: Input 4D tensor (samples, rows, cols, channels) :param filters: Integer, the dimensionality of the output space (i.e. the number of output filters in the convolution). 
:param activation: activation function to use everywhere in the block :param version: version of inception block :param pars: dictionary of parameters passed to u-net, determines the version, if this type of block is chosen :param allowed_pars: dictionary of all allowed to be passed to u-net parameters :return: 4D tensor (samples, rows, cols, channels) output of an inception block, given inputs """""" assert filters % 16 == 0 # checking that the allowed version names did not change in ALLOWED_PARS if allowed_pars != {}: assert allowed_pars.get('information_block').get('inception').get('et') == ['a', 'b'] # keep version argument if need to use without PARS assert version in ['a', 'b'] # setting the version from pars if pars.get('information_block').get('inception').get('et') is not None: version = pars.get('information_block').get('inception').get('et') assert activation in ['relu', 'elu', None] # actv is a function, not a string, like activation actv = activation == 'relu' and (lambda: LeakyReLU(0.0)) or activation == 'elu' and (lambda: ELU(1.0)) or None # vertical 1 c1_1 = Conv2D(filters=filters // 16, kernel_size=(1, 1), padding='same', activation=activation, kernel_initializer='he_normal')(inputs) if version == 'b': c1_2 = NConv2D(filters=filters // 8, kernel_size=(1, 5), padding='same', activation=activation, kernel_initializer='he_normal')(c1_1) c1 = Conv2D(filters=filters // 8, kernel_size=(5, 1), kernel_initializer='he_normal', padding='same')(c1_2) else: c1 = Conv2D(filters=filters // 8, kernel_size=(5, 5), kernel_initializer='he_normal', padding='same')(c1_1) # vertical 2 c2_1 = Conv2D(filters=filters // 8 * 3, kernel_size=(1, 1), padding='same', activation=activation, kernel_initializer='he_normal')(inputs) if version == 'b': c2_2 = NConv2D(filters=filters // 2, kernel_size=(1, 3), padding='same', activation=activation, kernel_initializer='he_normal')(c2_1) c2 = Conv2D(filters=filters // 2, kernel_size=(3, 1), kernel_initializer='he_normal', padding='same')(c2_2) else: c2 = Conv2D(filters=filters // 2, kernel_size=(3, 3), kernel_initializer='he_normal', padding='same')(c2_1) # vertical 3 p3_1 = MaxPooling2D(pool_size=(3, 3), strides=(1, 1), padding='same')(inputs) c3 = Conv2D(filters=filters // 8, kernel_size=(1, 1), padding='same', kernel_initializer='he_normal')(p3_1) # vertical 4 c4 = Conv2D(filters=filters // 4, kernel_size=(1, 1), padding='same', kernel_initializer='he_normal')(inputs) # concatenating verticals together, normalizing and applying activation result = concatenate([c1, c2, c3, c4], axis=3) result = BatchNormalization(axis=3)(result) result = actv()(result) return result # ====================================================================================================================== # Combining blocks, allowing to use different blocks from before # ====================================================================================================================== def pooling_block(inputs, filters, kernel_size=(3, 3), strides=(2, 2), padding='same', activation=None, pool_size=(2, 2), trainable=True, pars={}, allowed_pars={}): """"""Function returning the output of one of the pooling blocks. 
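Both options halve the spatial resolution when strides == pool_size == (2, 2); the difference is that the strided NConv2D learns its downsampling weights while MaxPooling2D is a fixed operation. A rough shape sketch (the 32-channel input is only an example):
        NConv2D(filters=32, kernel_size=(3, 3), strides=(2, 2), padding='same')(x)   # (N, 80, 112, 32) -> (N, 40, 56, 32)
        MaxPooling2D(pool_size=(2, 2), padding='same')(x)                            # (N, 80, 112, 32) -> (N, 40, 56, 32)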
Avoids the need for separate versions of the u-net depending on how the pooling operation is performed: 1) trainable (default): through the NConv2D custom function, see its documentation 2) non-trainable (alternative): through a MaxPooling operation To get the expected behaviour when switching 'trainable', keep strides == pool_size pool_size is only used in the (trainable=False) MaxPooling2D branch; filters, kernel_size, strides and activation are only used in the (trainable=True) NConv2D branch :param inputs: 4D tensor (samples, rows, cols, channels) :param filters: NConv2D argument, filters :param kernel_size: NConv2D argument, kernel_size :param strides: NConv2D argument, strides :param padding: NConv2D/MaxPooling2D argument, padding :param activation: NConv2D argument, activation :param pool_size: MaxPooling2D argument, pool_size :param trainable: boolean specifying the version of a pooling block with default behaviour trainable=True: NConv2D(inputs._keras_shape[3], kernel_size=kernel_size, strides=strides, padding=padding)( inputs) trainable=False: MaxPooling2D(pool_size=pool_size)(inputs) :param pars: dictionary of parameters passed to u-net, determines the version of the block :param allowed_pars: dictionary of all allowed to be passed to u-net parameters :return: 4D tensor (samples, rows, cols, channels) output of a pooling block """""" # checking that the allowed trainable parameters did not change in ALLOWED_PARS if allowed_pars != {}: assert allowed_pars.get('pooling_block').get('trainable') == [True, False] # keep trainable argument if need to use without PARS assert trainable in [True, False] # setting the version from pars if pars.get('pooling_block').get('trainable') is not None: trainable = pars.get('pooling_block').get('trainable') # returning block's output if trainable: return NConv2D(filters=filters, kernel_size=kernel_size, strides=strides, padding=padding, activation=activation)(inputs) else: return MaxPooling2D(pool_size=pool_size, padding=padding)(inputs) def information_block(inputs, filters, kernel_size=(3, 3), padding='valid', activation=None, block='inception', block_type='v2', version='b', pars={}, allowed_pars={}): """"""Function returning the output of one of the information blocks. :param inputs: 4D tensor (samples, rows, cols, channels) :param filters: Integer, the dimensionality of the output space (i.e. the number of output filters in the convolution). :param kernel_size: An integer or tuple/list of 2 integers, specifying the height and width of the 2D convolution window. Can be a single integer to specify the same value for all spatial dimensions.
:param padding: one of 'valid' or 'same' (case-insensitive), 'valid' by default to have the same as Conv2D :param activation: string, specifies activation function to use everywhere in the block Next 3 parameters are there to be able to leave 'pars' and 'allowed_pars' empty :param block: one of 'inception' or 'convolution' (case-sensitive) :param block_type: if block == 'inception', one of 'v1', 'v2', 'et' (case-sensitive) if block == 'convolution': one of 'simple', 'dilated' (case-sensitive) :param version: version of a block to use :param pars: dictionary of parameters passed to u-net, determines the version of the block :param allowed_pars: dictionary of all allowed to be passed to u-net parameters :return: 4D tensor (samples, rows, cols, channels) output of a information block """""" # getting which block, block_type, version to use as the information block if pars.get('information_block') is not None: block = list(pars.get('information_block').keys())[0] block_type = list(pars.get('information_block').get(block).keys())[0] version = pars.get('information_block').get(block).get(block_type) # inception block if block == 'inception': if block_type == 'v1': return inception_block_v1(inputs=inputs, filters=filters, activation=activation, version=version, pars=pars, allowed_pars=allowed_pars) elif block_type == 'v2': return inception_block_v2(inputs=inputs, filters=filters, activation=activation, version=version, pars=pars, allowed_pars=allowed_pars) else: return inception_block_et(inputs=inputs, filters=filters, activation=activation, version=version, pars=pars, allowed_pars=allowed_pars) # convolution block else: if block_type == 'simple': return convolution_block(inputs=inputs, filters=filters, kernel_size=kernel_size, padding=padding, activation=activation, version=version, pars=pars, allowed_pars=allowed_pars) else: return dilated_convolution_block(inputs=inputs, filters=filters, kernel_size=kernel_size, padding=padding, activation=activation, version=version, pars=pars, allowed_pars=allowed_pars) def connection_block(inputs, filters, padding='valid', activation=None, version='residual', pars={}, allowed_pars={}): """"""Function returning the output of one of the connection block. :param inputs: 4D tensor (samples, rows, cols, channels) :param filters: Integer, the dimensionality of the output space (i.e. the number of output filters in the convolution). 
:param padding: one of 'valid' or 'same' (case-insensitive), 'valid' by default to have the same as Conv2D :param activation: string, one of 'elu' or 'relu' or None (case-sensitive), specifies activation function to use everywhere in the block Version parameter is there to be able to leave 'pars' and 'allowed_pars' empty :param version: one of 'not_residual' or 'residual', version of a block to use :param pars: dictionary of parameters passed to u-net, determines the version of the block :param allowed_pars: dictionary of all allowed to be passed to u-net parameters :return: 4D tensor (samples, rows, cols, channels) output of a connection block """""" # checking that the allowed trainable parameters did not change in ALLOWED_PARS if allowed_pars != {}: assert allowed_pars.get('connection_block') == ['not_residual', 'residual'] # keep trainable argument if need to use without PARS assert version in ['not_residual', 'residual'] # setting the version from pars if pars.get('connection_block') is not None: version = pars.get('connection_block') if version == 'residual': return rblock(inputs=inputs, filters=32, kernel_size=(1, 1), padding='same', activation=activation) else: return Conv2D(filters=filters, kernel_size=(2, 2), padding=padding, kernel_initializer='he_normal')(inputs) '",Yes,5,84.0 execute_u_model = False,No,5,77.0 "######################################################################################################################## # ====================================================================================================================== # u_model # ====================================================================================================================== ######################################################################################################################## # needed for train # standard-module imports import numpy as np from keras.layers import Input, concatenate, Conv2D, UpSampling2D, Dense from keras.layers import Dropout, Flatten from keras.models import Model from keras.optimizers import Adam from keras import backend as K # # separate-module imports # from metric import dice_coef, dice_coef_loss # from u_model_blocks import pooling_block, connection_block, information_block # from configuration import ALLOWED_PARS, PARS IMG_ROWS, IMG_COLS = 80, 112 K.set_image_data_format('channels_last') # (number of images, rows per image, cols per image, channels) # ====================================================================================================================== # U-net with Inception blocks, Normalised 2D Convolutions instead of Maxpooling # ====================================================================================================================== def get_unet_customised(optimizer, pars=PARS, allowed_pars=ALLOWED_PARS): """""" Creating and compiling the U-net This version is fully customisable by choosing pars argument :param optimizer: specifies the optimiser for the u-net, e.g. Adam, RMSProp, etc. 
:param pars: optional, dictionary of parameters passed to customise the U-net :param allowed_pars: optional, dictionary of parameters allowed to be passed to customise the U-net :return: compiled u-net, Keras.Model object """""" # string, activation function activation = pars.get('activation') # input inputs = Input((IMG_ROWS, IMG_COLS, 1), name='main_input') print('inputs:', inputs._keras_shape) # # down the U-net # conv1 = information_block(inputs, 32, padding='same', activation=activation, pars=pars, allowed_pars=allowed_pars) print('conv1', conv1._keras_shape) pool1 = pooling_block(inputs=conv1, filters=32, activation=activation, pars=pars, allowed_pars=allowed_pars) print('pool1', pool1._keras_shape) pool1 = Dropout(0.5)(pool1) print('pool1', pool1._keras_shape) conv2 = information_block(pool1, 64, padding='same', activation=activation, pars=pars, allowed_pars=allowed_pars) print('conv2', conv2._keras_shape) pool2 = pooling_block(inputs=conv2, filters=64, activation=activation, pars=pars, allowed_pars=allowed_pars) print('pool2', pool2._keras_shape) pool2 = Dropout(0.5)(pool2) print('pool2', pool2._keras_shape) conv3 = information_block(pool2, 128, padding='same', activation=activation, pars=pars, allowed_pars=allowed_pars) print('conv3', conv3._keras_shape) pool3 = pooling_block(inputs=conv3, filters=128, activation=activation, pars=pars, allowed_pars=allowed_pars) print('pool3', pool3._keras_shape) pool3 = Dropout(0.5)(pool3) print('pool3', pool3._keras_shape) conv4 = information_block(pool3, 256, padding='same', activation=activation, pars=pars, allowed_pars=allowed_pars) print('conv4', conv4._keras_shape) pool4 = pooling_block(inputs=conv4, filters=256, activation=activation, pars=pars, allowed_pars=allowed_pars) print('pool4', pool4._keras_shape) pool4 = Dropout(0.5)(pool4) print('pool4', pool4._keras_shape) # # bottom level of the U-net # conv5 = information_block(pool4, 512, padding='same', activation=activation, pars=pars, allowed_pars=allowed_pars) print('conv5', conv5._keras_shape) conv5 = Dropout(0.5)(conv5) print('conv5', conv5._keras_shape) # # auxiliary output for predicting probability of nerve presence # if pars['outputs'] == 2: pre = Conv2D(1, kernel_size=(1, 1), kernel_initializer='he_normal', activation='sigmoid')(conv5) pre = Flatten()(pre) aux_out = Dense(1, activation='sigmoid', name='aux_output')(pre) # # up the U-net # after_conv4 = connection_block(conv4, 256, padding='same', activation=activation, pars=pars, allowed_pars=allowed_pars) print('after_conv4', after_conv4._keras_shape) up6 = concatenate([UpSampling2D(size=(2, 2))(conv5), after_conv4], axis=3) conv6 = information_block(up6, 256, padding='same', activation=activation, pars=pars, allowed_pars=allowed_pars) print('conv6', conv6._keras_shape) conv6 = Dropout(0.5)(conv6) print('conv6', conv6._keras_shape) after_conv3 = connection_block(conv3, 128, padding='same', activation=activation, pars=pars, allowed_pars=allowed_pars) print('after_conv3', after_conv3._keras_shape) up7 = concatenate([UpSampling2D(size=(2, 2))(conv6), after_conv3], axis=3) conv7 = information_block(up7, 128, padding='same', activation=activation, pars=pars, allowed_pars=allowed_pars) print('conv7', conv7._keras_shape) conv7 = Dropout(0.5)(conv7) print('conv7', conv7._keras_shape) after_conv2 = connection_block(conv2, 64, padding='same', activation=activation, pars=pars, allowed_pars=allowed_pars) print('after_conv2', after_conv2._keras_shape) up8 = concatenate([UpSampling2D(size=(2, 2))(conv7), after_conv2], axis=3) conv8 = 
information_block(up8, 64, padding='same', activation=activation, pars=pars, allowed_pars=allowed_pars) print('conv8', conv8._keras_shape) conv8 = Dropout(0.5)(conv8) print('conv8', conv8._keras_shape) after_conv1 = connection_block(conv1, 32, padding='same', activation=activation, pars=pars, allowed_pars=allowed_pars) print('after_conv1', after_conv1._keras_shape) up9 = concatenate([UpSampling2D(size=(2, 2))(conv8), after_conv1], axis=3) conv9 = information_block(up9, 32, padding='same', activation=activation, pars=pars, allowed_pars=allowed_pars) print('conv9', conv9._keras_shape) conv9 = Dropout(0.5)(conv9) print('conv9', conv9._keras_shape) # main output conv10 = Conv2D(1, kernel_size=(1, 1), kernel_initializer='he_normal', activation='sigmoid', name='main_output')( conv9) print('conv10', conv10._keras_shape) # creating a model # compiling the model if pars['outputs'] == 1: model = Model(inputs=inputs, outputs=conv10) model.compile(optimizer=optimizer, loss={'main_output': dice_coef_loss}, metrics={'main_output': dice_coef}) else: model = Model(inputs=inputs, outputs=[conv10, aux_out]) model.compile(optimizer=optimizer, loss={'main_output': dice_coef_loss, 'aux_output': 'binary_crossentropy'}, metrics={'main_output': dice_coef, 'aux_output': 'acc'}, loss_weights={'main_output': 1., 'aux_output': 0.5}) return model # ---------------------------------------------------------------------------------------------------------------------- # get_unet() allows to try other versions of the u-net, if more are specified get_unet = get_unet_customised if __name__ == '__main__' and (execute_u_model==True or execute_all==True): # test the u-net without training img_rows = IMG_ROWS img_cols = IMG_COLS # to check that model works without training, any kind of optimiser can be used model = get_unet(Adam(lr=1e-5), pars=PARS) x = np.random.random((1, img_rows, img_cols, 1)) result = model.predict(x, 1) print(result) print('params', model.count_params()) print('layer num', len(model.layers))'",Yes,5,53.0 execute_train = False,No,5,77.0 "######################################################################################################################## # ====================================================================================================================== # train # ====================================================================================================================== ######################################################################################################################## # standard-module imports import numpy as np from skimage.transform import resize from keras.callbacks import ModelCheckpoint, EarlyStopping # # separate-module imports # from u_model import get_unet, IMG_COLS as img_cols, IMG_ROWS as img_rows # from data import load_train_data, load_test_data, load_nerve_presence # from configuration import PARS, OPTIMIZER def preprocess(imgs, to_rows=None, to_cols=None): """"""Resize all images in a 4D tensor of images of the shape (samples, rows, cols, channels). 
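A rough usage sketch (the input sizes below are only an example, not fixed by this function):
        raw = np.zeros((10, 420, 580, 1), dtype=np.uint8)
        small = preprocess(raw, to_rows=80, to_cols=112)   # -> shape (10, 80, 112, 1)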
:param imgs: a 4D tensor of images of the shape (samples, rows, cols, channels) :param to_rows: new number of rows for images to be resized to :param to_cols: new number of rows for images to be resized to :return: a 4D tensor of images of the shape (samples, to_rows, to_cols, channels) """""" if to_rows is None or to_cols is None: to_rows = img_rows to_cols = img_cols print(imgs.shape) imgs_p = np.ndarray((imgs.shape[0], to_rows, to_cols, imgs.shape[3]), dtype=np.uint8) for i in range(imgs.shape[0]): imgs_p[i, :, :, 0] = resize(imgs[i, :, :, 0], (to_rows, to_cols), preserve_range=True) return imgs_p def train_and_predict(): print('-' * 30) print('Loading and preprocessing train data...') print('-' * 30) imgs_train, imgs_mask_train = load_train_data() imgs_present = load_nerve_presence() imgs_train = preprocess(imgs_train) imgs_mask_train = preprocess(imgs_mask_train) # centering and standardising the images imgs_train = imgs_train.astype('float32') mean = np.mean(imgs_train) std = np.std(imgs_train) imgs_train -= mean imgs_train /= std imgs_mask_train = imgs_mask_train.astype('float32') imgs_mask_train /= 255. # scale masks to be in {0, 1} instead of {0, 255} print('-' * 30) print('Creating and compiling model...') print('-' * 30) # load model - the Learning rate scheduler choice is most important here model = get_unet(optimizer=OPTIMIZER, pars=PARS) model_checkpoint = ModelCheckpoint('weights.h5', monitor='val_loss', save_best_only=True) early_stopping = EarlyStopping(patience=5, verbose=1) print('-' * 30) print('Fitting model...') print('-' * 30) if PARS['outputs'] == 1: imgs_labels = imgs_mask_train else: imgs_labels = [imgs_mask_train, imgs_present] model.fit(imgs_train, imgs_labels, batch_size=128, epochs=50, verbose=1, shuffle=True, validation_split=0.2, callbacks=[model_checkpoint, early_stopping]) print('-' * 30) print('Loading and preprocessing test data...') print('-' * 30) imgs_test = load_test_data() imgs_test = preprocess(imgs_test) imgs_test = imgs_test.astype('float32') imgs_test -= mean imgs_test /= std print('-' * 30) print('Loading saved weights...') print('-' * 30) model.load_weights('weights.h5') print('-' * 30) print('Predicting masks on test data...') print('-' * 30) imgs_mask_test = model.predict(imgs_test, verbose=1) if PARS['outputs'] == 1: np.save('imgs_mask_test.npy', imgs_mask_test) else: np.save('imgs_mask_test.npy', imgs_mask_test[0]) np.save('imgs_mask_test_present.npy', imgs_mask_test[1]) # -------------------------------------------------------------------------------------------------------------------- if __name__ == '__main__' and (execute_train==True or execute_all==True): train_and_predict()'",Yes,5,53.0 execute_submission = False,No,5,77.0 "import pandas as pd from sklearn.model_selection import train_test_split from sklearn.metrics import accuracy_score from nltk.tokenize import sent_tokenize, word_tokenize from nltk.tokenize import PunktSentenceTokenizer from nltk.tokenize import RegexpTokenizer from nltk.stem import PorterStemmer from nltk import pos_tag from sklearn.feature_extraction.text import TfidfVectorizer",No,5,22.0 "data = pd.read_csv(""../input/train.tsv"",delimiter='\\t') data.shape'",No,4,45.0 "adjectives = [] for i in range(0,data.shape[0]): pos_tagged = pos_tag(word_tokenize(data.iloc[i,2])) string = """" for j in range(0,len(pos_tagged)): if pos_tagged[j][1] in (""JJ"",""JJR"", ""JJS"", ""RB"", ""RBR"", ""RBS""): string = string + "" "" + pos_tagged[j][0] adjectives.append(string)",No,5,77.0 data['Adjective Review'] = 
adjectives,No,5,8.0 "data = data.drop(""PhraseId"",axis=1) data = data.drop(""SentenceId"",axis=1) data = data.drop(""Phrase"",axis=1) print(data.head())",No,4,10.0 "predictors = data['Adjective Review'] predictors.shape",No,5,58.0 "response = data[""Sentiment""] response.shape",No,5,58.0 "tv = TfidfVectorizer(min_df=0.0, max_df=1.0, ngram_range=(1,2),sublinear_tf=True,max_features=1000) tv_features = tv.fit_transform(predictors)",No,5,8.0 print(tv_features.shape),No,5,58.0 "train_predictors, test_predictors, train_response, test_response = train_test_split(tv_features, response, random_state = 0)",No,5,13.0 "print(train_predictors.shape) print(test_predictors.shape)",No,5,58.0 "from sklearn.ensemble import RandomForestClassifier model = RandomForestClassifier(max_features = 10).fit(train_predictors, train_response)",No,5,7.0 predicted_test_response = model.predict(test_predictors),No,5,48.0 "accuracy_score(test_response, predicted_test_response)",No,5,49.0 "test = pd.read_csv(""../input/test.tsv"",delimiter='\\t') test.shape'",No,4,45.0 test['Adjective Review'] = test_adjectives,No,5,8.0 "test = test.drop(""SentenceId"",axis=1) test = test.drop(""Phrase"",axis=1) print(test.head())",No,4,10.0 "test_predictors = test['Adjective Review'] test_predictors.shape",No,5,58.0 "test_tv = TfidfVectorizer(min_df=0.0, max_df=1.0, ngram_range=(1,2),sublinear_tf=True,max_features=1000) test_tv_features = tv.fit_transform(test_predictors)",No,5,8.0 test_response = model.predict(test_tv_features),No,5,48.0 len(test_response),No,5,58.0 test['Sentiment'] = test_response,No,5,8.0 "test.head() test.shape",No,4,41.0 "test = test.drop(""Adjective Review"",axis=1)",No,5,10.0 "test.to_csv(""Submission.csv"", sep=',',index=False)'",No,5,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load in import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) from keras.models import Sequential from keras.layers import CuDNNLSTM, Dropout, Dense,Conv1D, MaxPooling1D from keras.layers.embeddings import Embedding from keras.preprocessing.text import Tokenizer from keras.preprocessing.sequence import pad_sequences import string import nltk import re from nltk.corpus import stopwords from nltk.stem.snowball import SnowballStemmer # Input data files are available in the ""../input/"" directory. 
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory import os print(os.listdir(""../input"")) # Any results you write to the current directory are saved as output.",No,5,88.0 "test_data = pd.read_csv(""../input/test.tsv"",delimiter='\\t') train_data = pd.read_csv(""../input/train.tsv"",delimiter=""\\t"")'",No,5,45.0 from keras.utils import np_utils,No,5,22.0 "model.fit(X, Y, epochs=10, validation_split=0.2)",No,5,7.0 Y_test = model.predict(X_test),No,5,48.0 "submission = pd.DataFrame({'PhraseId' : test_data[""PhraseId""], 'Sentiment' : Y_test})'",No,5,12.0 "submission.to_csv(""submission.csv"", index=False)",No,5,25.0 "b""train_df=pd.read_csv('../input/train.tsv',sep='\\t')\ntest_df=pd.read_csv('../input/test.tsv',sep='\\t')""",No,5,45.0 "train_df['token']=train_df.apply(tokenizer,axis=1) test_df['token']=test_df.apply(tokenizer,axis=1) ",No,5,8.0 "train_df['lemma']=train_df.apply(lemmatize,axis=1) test_df['lemma']=test_df.apply(lemmatize,axis=1)",No,5,78.0 "#Loading the Data train= pd.read_csv('../input/airbnb-recruiting-new-user-bookings/train_users_2.csv.zip') test = pd.read_csv('../input/airbnb-recruiting-new-user-bookings/test_users.csv.zip') ",No,5,45.0 train.signup_method.unique(),No,5,57.0 train.affiliate_channel.unique(),No,5,57.0 train.affiliate_provider.unique(),No,5,57.0 train.first_affiliate_tracked.unique(),No,5,57.0 train.first_device_type.unique(),No,5,57.0 train.first_browser.unique(),No,5,57.0 train.signup_app.unique(),No,5,57.0 countries.sort_values(by='distance_km'),No,5,9.0 "a=train['country_destination'].value_counts() a=a.drop(['NDF','other'],axis=0) df_value_counts = pd.DataFrame(a) df_value_counts = a.reset_index() df_value_counts.columns = ['country_destination', 'value_counts'] # change column names mapdata= pd.merge(df_value_counts, countries,how= 'inner' , on='country_destination') mapdata['poppercent']=mapdata['value_counts']/mapdata['value_counts'].sum() mapdata['text']=['United States', 'France', 'Italy','United Kingdom', 'Spanish','Canada' ,'German', 'Dutch','Australia', 'Brazil' ] ",Yes,3,12.0 "# Create a world map to show distributions of users import folium from folium.plugins import MarkerCluster #empty map world_map= folium.Map(tiles=""cartodbpositron"") marker_cluster = MarkerCluster().add_to(world_map) #for each coordinate, create circlemarker of user percent for i in range(len(mapdata)): lat = mapdata.iloc[i]['lat_destination'] long = mapdata.iloc[i]['lng_destination'] radius=5 popup_text = """"""Country : {}
%of Users : {}
"""""" popup_text = popup_text.format(mapdata.iloc[i]['country_destination'], mapdata.iloc[i]['poppercent'] ) folium.CircleMarker(location = [lat, long], radius=radius, popup= popup_text, fill =True).add_to(marker_cluster) #show the map world_map'",Yes,5,33.0 "a=train.groupby('country_destination').count().sort_values(by='id',ascending=True) a.id ",Yes,5,9.0 countries.dtypes,No,5,70.0 "age_gender.drop(age_gender.index[0],inplace=True) age_gender",No,5,10.0 age_gender.country_destination.unique(),No,5,57.0 age_gender.age_bucket.unique(),No,5,57.0 "agebtw0n19= age_gender[(age_gender.age_bucket=='0-4') | (age_gender.age_bucket=='5-9') | (age_gender.age_bucket=='10-14')|(age_gender.age_bucket=='15-19')] agebtw49n34= age_gender[(age_gender.age_bucket=='45-49') | (age_gender.age_bucket=='40-44') | (age_gender.age_bucket=='35-39') | (age_gender.age_bucket=='30-34')] agebtw99n50= age_gender[(age_gender.age_bucket=='95-99') | (age_gender.age_bucket=='90-94') | (age_gender.age_bucket=='85-89') | (age_gender.age_bucket=='80-84') | (age_gender.age_bucket=='75-79') | (age_gender.age_bucket=='70-74') | (age_gender.age_bucket=='70-74') | (age_gender.age_bucket=='65-69') | (age_gender.age_bucket=='60-64') | (age_gender.age_bucket=='55-59') | (age_gender.age_bucket=='50-54') ] agebtw20n29= age_gender[(age_gender.age_bucket=='25-29') | (age_gender.age_bucket=='20-24')] ",No,4,13.0 "import seaborn as sns ax = sns.boxplot(x=agebtw20n29[""population_in_thousands""])",No,5,33.0 "import seaborn as sns ax = sns.boxplot(x=agebtw99n50[""population_in_thousands""])",No,5,33.0 "plt.figure(figsize=(20,5)) import seaborn as sns age_gender.sort_values(""age_bucket"", ascending=False,inplace=True) colors=sns.color_palette() sns.stripplot(x=""age_bucket"",y=""population_in_thousands"",data=age_gender,jitter=True,hue='country_destination',palette='pastel')'",No,5,81.0 "actionanditsdetail=pd.crosstab(age_gender.age_bucket, age_gender.country_destination,margins=False) from scipy.stats import chi2_contingency # defining the table stat, p, dof, expected = chi2_contingency(actionanditsdetail) # interpret p-value alpha = 0.05 print(""p value is "" + str(p)) if p <= alpha: \tprint('Dependent (reject H0)') else: \tprint('Independent (H0 holds true)')'",Yes,5,47.0 "actionanditsdetail=pd.crosstab(age_gender.country_destination, age_gender.gender,margins=False) from scipy.stats import chi2_contingency # defining the table stat, p, dof, expected = chi2_contingency(actionanditsdetail) # interpret p-value alpha = 0.05 print(""p value is "" + str(p)) if p <= alpha: \tprint('Dependent (reject H0)') else: \tprint('Independent (H0 holds true)') '",Yes,5,47.0 "sessions=sessions.sort_values('user_id') sessions.rename(columns={""user_id"": ""id""},inplace=True) sessions.sort_values(by='id',inplace=True) sessions = sessions[(sessions['secs_elapsed'].notnull()) & (sessions['secs_elapsed'] > 0.0) ] sessions[(sessions['action_detail']=='booking')]'",Yes,3,9.0 "groupedid=pd.DataFrame(sessions.groupby(['id'])['secs_elapsed'].sum()) groupedid['hour_elapsed']=groupedid['secs_elapsed'].div(3600).round(decimals=0) groupedid.drop('secs_elapsed',axis=1,inplace=True) groupedid.reset_index(inplace=True) groupedid[(groupedid['id'] == '6udv3scuxe') | (groupedid['id'] == 'yxf0sm9sbw') | (groupedid['id'] == 'nttj7g9av6')] ",Yes,4,12.0 groupedid.hour_elapsed.describe(),No,5,40.0 "actionndevicetype=pd.crosstab(sessions.action, sessions.device_type,margins=False) from scipy.stats import chi2_contingency # defining the table stat, p, dof, expected = 
chi2_contingency(actionndevicetype) # interpret p-value alpha = 0.05 print(""p value is "" + str(p)) if p <= alpha: \tprint('Dependent (reject H0)') else: \tprint('Independent (H0 holds true)')'",Yes,5,47.0 "import pandas as pd import matplotlib.pyplot as plt import seaborn as sns print(train.country_destination.value_counts()) sns.countplot(train.country_destination)",Yes,3,72.0 "import pandas as pd #Loading the Data again train= pd.read_csv('../input/airbnb-recruiting-new-user-bookings/train_users_2.csv.zip') test = pd.read_csv('../input/airbnb-recruiting-new-user-bookings/test_users.csv.zip') ",Yes,5,45.0 "#missing data total = train.isnull().sum().sort_values(ascending=False) percent = (train.isnull().sum()/train.isnull().count()).sort_values(ascending=False) missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent']) missing_data.head(20)",Yes,2,11.0 "train['first_affiliate_tracked'] = train['first_affiliate_tracked'].fillna('Unknown') test['first_affiliate_tracked'] = test['first_affiliate_tracked'].fillna('Unknown')",No,5,17.0 "import seaborn as sns sns.boxplot(x=train.age)",Yes,5,33.0 train.age.describe(),No,5,40.0 "b""import numpy as np\n#train=train[(train.age > 14) & (train.age < 110)]# modele sadece 14 yandan byk ve 110 yandan kkleri dahil edebilirim.\n#test=test[(test.age > 14) & (test.age < 110)]\ntrain[(train.age < 14) & (train.age > 110)]=np.nan\ntest[(test.age < 14) & (test.age > 110)]=np.nan\ntrain['age'].fillna(train['age'].mean(), inplace=True)\ntest['age'].fillna(train['age'].mean(), inplace=True)""",Yes,5,17.0 "#Converting below columns as categories for plotting in graphs categorical_features = [ 'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked', 'first_browser', 'first_device_type', 'gender', 'language', 'signup_app', 'signup_method', 'signup_flow' ] for categorical_feature in categorical_features: train[categorical_feature] = train[categorical_feature].astype('category') for categorical_feature in categorical_features: test[categorical_feature] = test[categorical_feature].astype('category') ",No,5,16.0 "train['date_account_created'] = pd.to_datetime(train['date_account_created']) train['date_first_booking'] = pd.to_datetime(train['date_first_booking']) train['timestamp_first_active'] = pd.to_datetime(train['timestamp_first_active'], format='%Y%m%d%H%M%S') train['timestamp_first_active'] = pd.to_datetime(train['timestamp_first_active']).dt.date test['date_account_created'] = pd.to_datetime(test['date_account_created']) test['timestamp_first_active'] = pd.to_datetime(test['timestamp_first_active'], format='%Y%m%d%H%M%S') test['timestamp_first_active'] = pd.to_datetime(test['timestamp_first_active']).dt.date ",No,5,16.0 "import matplotlib.pyplot as plt import seaborn as sns # Use seaborn style defaults and set the default figure size df = train.groupby(['date_first_booking'])['country_destination'].count().reset_index() df.dropna(axis=0,inplace=True) import plotly.express as px fig = px.line(df, x='date_first_booking', y=""country_destination"") fig.show() '",Yes,5,81.0 "train.drop(['date_first_booking'], axis=1,inplace=True) test.drop(['date_first_booking'], axis=1,inplace=True)",No,5,10.0 "#date_account_created dac = np.vstack(train.date_account_created.astype(str).apply(lambda x: list(map(int, x.split('-')))).values) train['dac_year'] = dac[:,0] train['dac_month'] = dac[:,1] train['dac_day'] = dac[:,2] train.drop(['date_account_created'], axis=1,inplace=True) #timestamp_first_active tfa = 
np.vstack(train.timestamp_first_active.astype(str).apply(lambda x: list(map(int, x.split('-')))).values) train['tfa_year'] = tfa[:,0] train['tfa_month'] = tfa[:,1] train['tfa_day'] = tfa[:,2] train.drop(['timestamp_first_active'], axis=1,inplace=True) #date_account_created dac = np.vstack(test.date_account_created.astype(str).apply(lambda x: list(map(int, x.split('-')))).values) test['dac_year'] = dac[:,0] test['dac_month'] = dac[:,1] test['dac_day'] = dac[:,2] test.drop(['date_account_created'], axis=1,inplace=True) #timestamp_first_active tfa = np.vstack(test.timestamp_first_active.astype(str).apply(lambda x: list(map(int, x.split('-')))).values) test['tfa_year'] = tfa[:,0] test['tfa_month'] = tfa[:,1] test['tfa_day'] = tfa[:,2] test.drop(['timestamp_first_active'], axis=1,inplace=True)",No,4,78.0 "from sklearn.preprocessing import LabelEncoder le = LabelEncoder() train['gender']= le.fit_transform(train['gender']) train['signup_method']= le.fit_transform(train['signup_method']) train['first_affiliate_tracked']= le.fit_transform(train['first_affiliate_tracked']) train['signup_method']= le.fit_transform(train['signup_method']) train['language']= le.fit_transform(train['language']) train['affiliate_channel']= le.fit_transform(train['affiliate_channel']) train['affiliate_provider']= le.fit_transform(train['affiliate_provider']) train['signup_app']= le.fit_transform(train['signup_app']) train['first_device_type']= le.fit_transform(train['first_device_type']) train['first_browser']= le.fit_transform(train['first_browser']) train['signup_flow']= le.fit_transform(train['signup_flow'])",Yes,5,20.0 "le = LabelEncoder() test['gender']= le.fit_transform(test['gender']) test['signup_method']= le.fit_transform(test['signup_method']) test['first_affiliate_tracked']= le.fit_transform(test['first_affiliate_tracked']) test['signup_method']= le.fit_transform(test['signup_method']) test['language']= le.fit_transform(test['language']) test['affiliate_channel']= le.fit_transform(test['affiliate_channel']) test['affiliate_provider']= le.fit_transform(test['affiliate_provider']) test['signup_app']= le.fit_transform(test['signup_app']) test['first_device_type']= le.fit_transform(test['first_device_type']) test['first_browser']= le.fit_transform(test['first_browser']) test['signup_flow']= le.fit_transform(test['signup_flow'])",Yes,5,20.0 "train.country_destination.replace('NDF',0,inplace=True) train.country_destination.replace('US',1,inplace=True) train.country_destination.replace('other',2,inplace=True) train.country_destination.replace('FR',3,inplace=True) train.country_destination.replace('CA',4,inplace=True) train.country_destination.replace('GB',5,inplace=True) train.country_destination.replace('ES',6,inplace=True) train.country_destination.replace('IT',7,inplace=True) train.country_destination.replace('PT',8,inplace=True) train.country_destination.replace('NL',9,inplace=True) train.country_destination.replace('DE',10,inplace=True) train.country_destination.replace('AU',11,inplace=True)",No,5,20.0 "from sklearn.model_selection import train_test_split y=train['country_destination'] X=train.drop(['country_destination','id'],axis=1) from imblearn.combine import SMOTETomek # transform the dataset smotetomek = SMOTETomek(sampling_strategy='auto') # split the dataset into train and test sets X_train1, X_test1, y_train1, y_test1 = train_test_split(X, y, test_size=0.30, random_state=1,shuffle=True,stratify=y) X_train, y_train = smotetomek.fit_resample(X_train1, y_train1)",Yes,4,13.0 "x=pd.DataFrame(X_train) 
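# note: X_train / y_train are the SMOTETomek-resampled training data, so the countplot below
# should show a roughly balanced country_destination distribution, unlike the raw training set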
Y=pd.DataFrame(y_train) result = pd.concat([x, Y], axis=1, join='inner') sns.countplot(result.country_destination)",Yes,5,12.0 "target_names = ['NDF', 'US', 'other', 'FR', 'CA', 'GB', 'ES', 'IT', 'PT', 'NL','DE', 'AU'] #Classifier from sklearn.preprocessing import StandardScaler from sklearn.ensemble import RandomForestClassifier from sklearn.metrics import confusion_matrix from sklearn.metrics import classification_report rf=RandomForestClassifier() rf.fit(X_train,y_train) y_predrf=rf.predict(X_test1) print(classification_report(y_test1, y_predrf, target_names=target_names))",Yes,4,7.0 "import pandas as pd feature_imp = pd.Series(rf.feature_importances_,index=feature_names).sort_values(ascending=False) feature_imp ",Yes,5,79.0 "import matplotlib.pyplot as plt import seaborn as sns %matplotlib inline # Creating a bar plot sns.barplot(x=feature_imp, y=feature_imp.index) # Add labels to your graph plt.xlabel('Feature Importance Score') plt.ylabel('Features') plt.title(""Visualizing Important Features"") plt.legend() plt.show()'",Yes,5,79.0 "pred_country={0:""NDF"", 1:""US"", 2:""other"", 3:""FR"", 4:""CA"", 5:""GB"", 6:""ES"", 7:""IT"", 8:""PT"", 9:""DE"", 10:""NL"", 11:""AU""}",No,3,77.0 "from xgboost.sklearn import XGBClassifier xgb = XGBClassifier() xgb.fit(X_train,y_train) y_predxgb=xgb.predict(X_test1) print(classification_report(y_test1, y_predxgb, target_names=target_names)) ",Yes,3,7.0 "predictionsxgb=xgb.predict(test.drop(['id'],axis=1))",No,5,48.0 "from sklearn.neural_network import MLPClassifier #Generate prediction using Neural Net mlp = MLPClassifier(activation='identity', solver='sgd',learning_rate='adaptive', alpha=0.0001, batch_size='auto') mlp.fit(X_train,y_train) predsmlp = mlp.predict(X_test1) from sklearn import metrics print(classification_report(y_test1, predsmlp, target_names=target_names))",Yes,3,7.0 "from sklearn.neural_network import MLPClassifier from sklearn.model_selection import GridSearchCV parameters = {'solver': ['lbfgs','sgd'], 'max_iter': [1000,1100,1200,1300,1400,1500,1600,1700,1800,1900,2000 ], 'alpha': 10.0 ** -np.arange(1, 10), 'hidden_layer_sizes':np.arange(10, 15), 'random_state':[0,1,2,3,4,5,6,7,8,9]} mlpgridsearch = GridSearchCV(MLPClassifier(), parameters, n_jobs=-1) mlpgridsearch.fit(X_train,y_train) predsgridmlp = mlpgridsearch.predict(X_test1) from sklearn import metrics print(classification_report(y_test1, predsgridmlp, target_names=target_names))",Yes,4,6.0 "from sklearn.naive_bayes import ComplementNB cnb = ComplementNB() cnb.fit(X_train, y_train) y_predcnb=cnb.predict(X_test1) from sklearn import metrics print(classification_report(y_test1, y_predcnb, target_names=target_names))",Yes,4,49.0 "import pandas as pd import numpy as np from sklearn.preprocessing import LabelEncoder from xgboost.sklearn import XGBClassifier #Loading the Data again train= pd.read_csv('../input/airbnb-recruiting-new-user-bookings/train_users_2.csv.zip') test = pd.read_csv('../input/airbnb-recruiting-new-user-bookings/test_users.csv.zip') train['first_affiliate_tracked'] = train['first_affiliate_tracked'].fillna('Unknown') test['first_affiliate_tracked'] = test['first_affiliate_tracked'].fillna('Unknown') train['date_account_created'] = pd.to_datetime(train['date_account_created']) train['timestamp_first_active'] = pd.to_datetime(train['timestamp_first_active'], format='%Y%m%d%H%M%S') train['timestamp_first_active'] = pd.to_datetime(train['timestamp_first_active']).dt.date test['date_account_created'] = pd.to_datetime(test['date_account_created']) 
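# timestamp_first_active is stored as a 14-digit number (YYYYMMDDHHMMSS),
# hence the explicit '%Y%m%d%H%M%S' format in the conversion below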
test['timestamp_first_active'] = pd.to_datetime(test['timestamp_first_active'], format='%Y%m%d%H%M%S') test['timestamp_first_active'] = pd.to_datetime(test['timestamp_first_active']).dt.date train.drop(['date_first_booking'], axis=1,inplace=True) test.drop(['date_first_booking'], axis=1,inplace=True) #date_account_created dac = np.vstack(train.date_account_created.astype(str).apply(lambda x: list(map(int, x.split('-')))).values) train['dac_year'] = dac[:,0] train['dac_month'] = dac[:,1] train['dac_day'] = dac[:,2] train.drop(['date_account_created'], axis=1,inplace=True) #timestamp_first_active tfa = np.vstack(train.timestamp_first_active.astype(str).apply(lambda x: list(map(int, x.split('-')))).values) train['tfa_year'] = tfa[:,0] train['tfa_month'] = tfa[:,1] train['tfa_day'] = tfa[:,2] train.drop(['timestamp_first_active'], axis=1,inplace=True) #date_account_created dac = np.vstack(test.date_account_created.astype(str).apply(lambda x: list(map(int, x.split('-')))).values) test['dac_year'] = dac[:,0] test['dac_month'] = dac[:,1] test['dac_day'] = dac[:,2] test.drop(['date_account_created'], axis=1,inplace=True) #timestamp_first_active tfa = np.vstack(test.timestamp_first_active.astype(str).apply(lambda x: list(map(int, x.split('-')))).values) test['tfa_year'] = tfa[:,0] test['tfa_month'] = tfa[:,1] test['tfa_day'] = tfa[:,2] test.drop(['timestamp_first_active'], axis=1,inplace=True) import numpy as np train[(train.age < 14) & (train.age > 110)]=np.nan train['age'].fillna(train['age'].mean(), inplace=True) test[(test.age < 14) & (test.age > 110)]=np.nan test['age'].fillna(train['age'].mean(), inplace=True) #Converting below columns as categories for plotting in graphs categorical_features = [ 'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked', 'first_browser', 'first_device_type', 'gender', 'language', 'signup_app', 'signup_method', 'signup_flow' ] for categorical_feature in categorical_features: train[categorical_feature] = train[categorical_feature].astype('category') for categorical_feature in categorical_features: test[categorical_feature] = test[categorical_feature].astype('category') #One-hot-encoding features ohe_feats = ['gender', 'signup_method', 'signup_flow', 'language', 'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked', 'signup_app', 'first_device_type', 'first_browser'] for f in ohe_feats: train_dummy = pd.get_dummies(train[f], prefix=f) train_cont= train.drop([f], axis=1) train = pd.concat((train_cont, train_dummy), axis=1) #One-hot-encoding features ohe_feats = ['gender', 'signup_method', 'signup_flow', 'language', 'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked', 'signup_app', 'first_device_type', 'first_browser'] for f in ohe_feats: test_dummy = pd.get_dummies(test[f], prefix=f) test_cont= test.drop([f], axis=1) test = pd.concat((test_cont, test_dummy), axis=1) #Splitting train and test from sklearn.model_selection import train_test_split y=train['country_destination'] X=train.drop(['country_destination','id'],axis=1) from imblearn.combine import SMOTETomek transform the dataset smotetomek = SMOTETomek(sampling_strategy='auto') split the dataset into train and test sets X_train1, X_test1, y_train1, y_test1 = train_test_split(X, y, test_size=0.30, random_state=1,shuffle=True) X_train, y_train = smotetomek.fit_resample(X_train1, y_train1) #Classifier xgb = XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.5, gamma=0, gpu_id=-1, importance_type='gain', 
interaction_constraints=None, learning_rate=0.3, max_delta_step=0, max_depth=6, min_child_weight=1, monotone_constraints=None, n_estimators=25, n_jobs=0, num_parallel_tree=1, objective='multi:softprob', random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=None, seed=0, subsample=0.5, tree_method=None, validate_parameters=False, verbosity=None) xgb.fit(X_train, y_train) target_names = ['NDF', 'US', 'other', 'FR', 'CA', 'GB', 'ES', 'IT', 'PT', 'NL','DE', 'AU'] y_predxgb=xgb.predict(X_test1) from sklearn.metrics import classification_report print(classification_report(y_test1, y_predxgb, target_names=target_names))",Yes,2,7.0 "import pandas as pd from bs4 import BeautifulSoup import nltk from nltk.corpus import stopwords import re import numpy as np import time from sklearn.ensemble import RandomForestClassifier from sklearn.feature_extraction.text import CountVectorizer",No,5,22.0 "train = pd.read_csv(""../input/train.tsv"", header = 0, delimiter = '\\t') test = pd.read_csv(""../input/test.tsv"", header = 0, delimiter = '\\t')'",No,5,45.0 "def phrase_to_words(raw_phrase): #remove any html phrase_text = BeautifulSoup(raw_phrase).get_text() #remove non letters letters = re.sub(""[^A-Za-z]"", "" "", phrase_text) #to lowercase lower_letters = letters.lower().split() #remove stopwords stop = set(stopwords.words('english')) meaningful_words = [word for word in lower_letters if word not in stop] return ("" "".join(meaningful_words))'",No,5,78.0 "#First the train set num_phrase = train['Phrase'].size clean_train_phrase = [] for i in range(0, num_phrase): if( (i+1)%10000 == 0 ): print (""Review %d of %d\ "" % ( i+1, num_phrase )) clean_train_phrase.append(phrase_to_words(train['Phrase'][i]))'",No,3,78.0 "print(""Making Bag of words"") vectorizer = CountVectorizer(analyzer = ""word"", \\ tokenizer = None, \\ preprocessor = None, \\ stop_words = None, \\ max_features = 5000) train_data_features = vectorizer.fit_transform(clean_train_phrase) # Numpy arrays are easy to work with, so convert the result to an # array train_data_features = train_data_features.toarray() '",No,5,8.0 "import time start = time.time() # Start time print(""Training the Random Forest"") from sklearn.ensemble import RandomForestClassifier rf = RandomForestClassifier(n_estimators=100, n_jobs = -1) rf = rf.fit(train_data_features, train['Sentiment']) # Get the end time and print how long the process took end = time.time() elapsed = end - start print (""Time taken for K Means clustering: "", elapsed, ""seconds."")'",Yes,5,7.0 test.columns.values,No,5,71.0 "num_test_phrase = test['Phrase'].size clean_test_phrase = [] for i in range(0, num_test_phrase): if( (i+1)%10000 == 0 ): print (""Review %d of %d\ "" % ( i+1, num_test_phrase )) clean_test_phrase.append(phrase_to_words(test['Phrase'][i]))'",No,4,78.0 "#Get a bag of words for the test set, and convert to a numpy array test_data_features = vectorizer.transform(clean_test_phrase) test_data_features = test_data_features.toarray() #Use the random forest to make sentiment label predictions result = rf.predict(test_data_features) # Copy the results to a pandas dataframe with an ""id"" column and # a ""sentiment"" column output = pd.DataFrame( data={""PhraseId"":test[""PhraseId""], ""Sentiment"":result} ) # Use pandas to write the comma-separated output file output.to_csv( ""Sentiment_Analysis_Movie.csv"", index=False, quoting=3 )",Yes,3,48.0 "train = pd.read_csv(""../input/train.tsv"", sep='\\t') test = pd.read_csv(""../input/test.tsv"", sep='\\t') #train = train[0:1000] 
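# the commented-out slice above is handy for prototyping on a small subset;
# the next line casts Sentiment to str, and it is converted back to int a few cells
# further down before the labels are turned into torch tensors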
train['Sentiment'] = train['Sentiment'].apply(str) '",Yes,4,45.0 "train.head() ",No,5,41.0 train.Sentiment.value_counts(),No,5,72.0 "import tensorflow as tf device_name = tf.test.gpu_device_name() if device_name != '/device:GPU:0': raise SystemError('GPU device not found') print('Found GPU at: {}'.format(device_name))",Yes,5,23.0 test['Phrase'][0],No,5,41.0 train['Sentiment'].unique(),No,5,57.0 !pip install pytorch-pretrained-bert pytorch-nlp,No,5,87.0 "import torch from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler from keras.preprocessing.sequence import pad_sequences from sklearn.model_selection import train_test_split from pytorch_pretrained_bert import BertTokenizer, BertConfig from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification from tqdm import tqdm, trange import pandas as pd import io import numpy as np import matplotlib.pyplot as plt % matplotlib inline",No,5,22.0 "device = torch.device(""cuda"" if torch.cuda.is_available() else ""cpu"") n_gpu = torch.cuda.device_count() torch.cuda.get_device_name(0)",No,5,23.0 "train['Sentiment'] = train.Sentiment.astype(int) test['Sentiment'] = test.Sentiment.astype(int)",No,5,16.0 "# Create sentence and label lists sentences = train.Phrase.values # We need to add special tokens at the beginning and end of each sentence for BERT to work properly sentences = [""[CLS] "" + sentence + "" [SEP]"" for sentence in sentences] labels = train.Sentiment.values",No,5,77.0 np.unique(labels),No,5,57.0 "tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True) tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences] print (""Tokenize the first sentence:"") print (tokenized_texts[0])'",No,4,78.0 import seaborn as sns,No,5,22.0 sns.distplot(train.Phrase.apply(lambda x: len(x.split()))),No,5,33.0 "# Set the maximum sequence length. The longest sequence in our training set is 47, but we'll leave room on the end anyway. # In the original paper, the authors used a length of 512. MAX_LEN = 32",No,5,77.0 "# Pad our input tokens input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts], maxlen=MAX_LEN, dtype=""long"", truncating=""post"", padding=""post"")",No,5,78.0 "# Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]",No,5,78.0 "input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype=""long"", truncating=""post"", padding=""post"")",No,5,78.0 "# Create attention masks attention_masks = [] # Create a mask of 1s for each token followed by 0s for padding for seq in input_ids: seq_mask = [float(i>0) for i in seq] attention_masks.append(seq_mask)",No,5,53.0 "# Use train_test_split to split our data into train and validation sets for training train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels, random_state=2018, test_size=0.1) train_masks, validation_masks, _, _ = train_test_split(attention_masks, input_ids, random_state=2018, test_size=0.1)",No,5,13.0 train_labels,No,5,41.0 "# Convert all of our data into torch tensors, the required datatype for our model train_inputs = torch.tensor(train_inputs) validation_inputs = torch.tensor(validation_inputs) train_labels = torch.tensor(train_labels) validation_labels = torch.tensor(validation_labels) train_masks = torch.tensor(train_masks) validation_masks = torch.tensor(validation_masks)",No,5,16.0 "# Select a batch size for training. 
For fine-tuning BERT on a specific task, the authors recommend a batch size of 16 or 32 batch_size = 32 # Create an iterator of our data with torch DataLoader. This helps save on memory during training because, unlike a for loop, # with an iterator the entire dataset does not need to be loaded into memory train_data = TensorDataset(train_inputs, train_masks, train_labels) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size) validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels) validation_sampler = SequentialSampler(validation_data) validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size) ",No,4,13.0 "# Load BertForSequenceClassification, the pretrained BERT model with a single linear classification layer on top. model = BertForSequenceClassification.from_pretrained(""bert-base-uncased"", num_labels=5) model.cuda()",No,5,30.0 "param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'gamma', 'beta'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0} ] ",No,5,59.0 "# This variable contains all of the hyperparemeter information our training loop needs optimizer = BertAdam(optimizer_grouped_parameters, lr=2e-5, warmup=.1)",No,3,59.0 "# Function to calculate the accuracy of our predictions vs labels def flat_accuracy(preds, labels): pred_flat = np.argmax(preds, axis=1).flatten() labels_flat = labels.flatten() return np.sum(pred_flat == labels_flat) / len(labels_flat)",No,5,84.0 "train_loss_set = [] # Number of training epochs (authors recommend between 2 and 4) epochs = 4 # trange is a tqdm wrapper around the normal python range for _ in trange(epochs, desc=""Epoch""): # Training # Set our model to training mode (as opposed to evaluation mode) model.train() # Tracking variables tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 # Train the data for one epoch for step, batch in enumerate(train_dataloader): # Add batch to GPU batch = tuple(t.to(device) for t in batch) # Unpack the inputs from our dataloader b_input_ids, b_input_mask, b_labels = batch # Clear out the gradients (by default they accumulate) optimizer.zero_grad() # Forward pass loss = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels) train_loss_set.append(loss.item()) # Backward pass loss.backward() # Update parameters and take a step using the computed gradient optimizer.step() # Update tracking variables tr_loss += loss.item() nb_tr_examples += b_input_ids.size(0) nb_tr_steps += 1 print(""Train loss: {}"".format(tr_loss/nb_tr_steps)) # Validation # Put model in evaluation mode to evaluate loss on the validation set model.eval() # Tracking variables eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 # Evaluate data for one epoch for batch in validation_dataloader: # Add batch to GPU batch = tuple(t.to(device) for t in batch) # Unpack the inputs from our dataloader b_input_ids, b_input_mask, b_labels = batch # Telling the model not to compute or store gradients, saving memory and speeding up validation with torch.no_grad(): # Forward pass, calculate logit predictions logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask) # Move logits and labels to CPU logits = logits.detach().cpu().numpy() label_ids 
= b_labels.to('cpu').numpy() tmp_eval_accuracy = flat_accuracy(logits, label_ids) eval_accuracy += tmp_eval_accuracy nb_eval_steps += 1 print(""Validation Accuracy: {}"".format(eval_accuracy/nb_eval_steps))'",No,4,7.0 "plt.figure(figsize=(15,8)) plt.title(""Training loss"") plt.xlabel(""Batch"") plt.ylabel(""Loss"") plt.plot(train_loss_set) plt.show()",No,5,35.0 "test = pd.read_csv(""../input/test.tsv"", sep='\\t') test_id = test['PhraseId']'",No,4,45.0 "# Create sentence and label lists sentences = test.Phrase.values # We need to add special tokens at the beginning and end of each sentence for BERT to work properly sentences = [""[CLS] "" + sentence + "" [SEP]"" for sentence in sentences] # labels = test.Sentiment.values tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]",No,5,78.0 "MAX_LEN = 32 # Pad our input tokens input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts], maxlen=MAX_LEN, dtype=""long"", truncating=""post"", padding=""post"") # Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts] input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype=""long"", truncating=""post"", padding=""post"") # Create attention masks attention_masks = [] # Create a mask of 1s for each token followed by 0s for padding for seq in input_ids: seq_mask = [float(i>0) for i in seq] attention_masks.append(seq_mask) ",Yes,5,78.0 "prediction_inputs = torch.tensor(input_ids) prediction_masks = torch.tensor(attention_masks) prediction_labels = torch.tensor(labels) batch_size = 32 ",Yes,5,16.0 "prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels) prediction_sampler = SequentialSampler(prediction_data) prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)",No,4,13.0 "# Prediction on test set # Put model in evaluation mode model.eval() # Tracking variables predictions , true_labels = [], [] # Predict for batch in prediction_dataloader: # Add batch to GPU batch = tuple(t.to(device) for t in batch) # Unpack the inputs from our dataloader b_input_ids, b_input_mask, b_labels = batch # Telling the model not to compute or store gradients, saving memory and speeding up prediction with torch.no_grad(): # Forward pass, calculate logit predictions logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask) # Move logits and labels to CPU logits = logits.detach().cpu().numpy() label_ids = b_labels.to('cpu').numpy() # Store predictions and true labels predictions.append(logits) # true_labels.append(label_ids)",No,5,48.0 preds = np.concatenate(predictions),No,5,11.0 "# preds, target = learn_classifier.get_preds(DatasetType.Test, ordered=True) labels = np.argmax(preds, axis =1) ",No,2,8.0 "submission = pd.DataFrame({'PhraseId': test_id, 'Sentiment': labels}) submission.to_csv('submission.csv', index=False) submission.head()",Yes,5,25.0 "def mlen(row): s=row['lemma'].split(' ') return len(s) train_df['len']=train_df.apply(mlen,axis=1) train_df.head()",No,5,8.0 max(train_df['len']),No,5,40.0 "test_df['len']=test_df.apply(mlen,axis=1) max(test_df['len'])",No,5,8.0 "from sklearn.preprocessing import OneHotEncoder ohe=OneHotEncoder() Y_train=train_df.iloc[:,3] Y_train=Y_train.as_matrix() Y_train=Y_train.reshape(-1,1) Y_tr=ohe.fit_transform(Y_train)",No,5,20.0 "from keras.models import Sequential from keras.layers import Dense,Dropout,Embedding,LSTM",No,5,22.0 
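The row above only imports the Sequential model and the Dense/Dropout/Embedding/LSTM layers, while the rows that follow call model.summary() and model.fit(X_train, Y_tr, batch_size=64, epochs=15) directly. Below is a minimal sketch of a compatible Keras definition, not the notebook's original: vocab_size and maxlen are hypothetical stand-ins (in practice they would come from the fitted tokenizer and the padded phrase length, cf. max(train_df['len'])), and the 5-unit softmax matches the one-hot encoded Y_tr.

# Illustrative sketch only (assumed values marked as hypothetical), not the original definition.
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM

vocab_size = 20000   # hypothetical tokenizer vocabulary size
maxlen = 48          # hypothetical padded phrase length

model = Sequential()
model.add(Embedding(vocab_size, 128, input_length=maxlen))  # map word indices to 128-d vectors
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))     # encode the padded phrase
model.add(Dropout(0.3))
model.add(Dense(5, activation='softmax'))                   # 5 sentiment classes, matching Y_tr
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

With a definition of this shape, the subsequent model.fit call expects X_train to be an integer matrix of shape (n_samples, maxlen) and Y_tr to be the one-hot target produced by the OneHotEncoder cell above.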
model.summary(),No,5,79.0 "model.fit(X_train,Y_tr,batch_size=64,epochs=15)",No,5,7.0 "s=model.predict(X_test) s=np.argmax(s,axis=1) print(s) s=pd.DataFrame(s) s['PhraseId']=test_df['PhraseId'] s.columns=['Sentiment','PhraseId'] s=s[['PhraseId','Sentiment']] s.to_csv('submissions.csv',index=False)",No,4,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load in import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) import os for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename)) # Any results you write to the current directory are saved as output.",No,5,88.0 "try: %tensorflow_version 2.x except Exception: pass import tensorflow as tf # The Natural Language Toolkit, or more commonly NLTK, is a suite of libraries and programs for symbolic and # statistical natural language processing for English written in the Python programming language. import nltk from nltk.tokenize import word_tokenize from nltk.stem import WordNetLemmatizer lemmatizer = WordNetLemmatizer() from bs4 import BeautifulSoup import re #TQDM is a progress bar library with good support for nested loops and Jupyter/IPython notebooks. from tqdm import tqdm",No,5,84.0 "from keras.utils import to_categorical import random from sklearn.model_selection import train_test_split from keras.preprocessing import sequence from keras.preprocessing.text import Tokenizer from keras.layers import Dense,Dropout,Embedding,LSTM from keras.callbacks import EarlyStopping from keras.losses import categorical_crossentropy from keras.optimizers import Adam from keras.models import Sequential #set random seed for the session and also for tensorflow that runs in background for keras tf.random.set_seed(123) random.seed(123)",No,5,23.0 "from zipfile import ZipFile for zip_path in ['../input/sentiment-analysis-on-movie-reviews/train.tsv.zip', '../input/sentiment-analysis-on-movie-reviews/test.tsv.zip']: with ZipFile(zip_path, 'r') as zip: # printing all the contents of the zip file zip.printdir() # extracting all the files print('Extracting all the files now...') zip.extractall() print('Done!') ",No,5,73.0 "sample = pd.read_csv(""../input/sentiment-analysis-on-movie-reviews/sampleSubmission.csv"") train = pd.read_csv(""train.tsv"", delimiter='\\t') test = pd.read_csv(""test.tsv"", delimiter='\\t')'",No,5,45.0 "train.shape, test.shape",No,5,58.0 " def clean_sentences(df): reviews = [] for sent in tqdm(df['Phrase']): #remove html content review_text = BeautifulSoup(sent).get_text() #remove non-alphabetic characters review_text = re.sub(""[^a-zA-Z]"","" "", review_text) #tokenize the sentences words = word_tokenize(review_text.lower()) #lemmatize each word to its lemma lemma_words = [lemmatizer.lemmatize(i) for i in words] reviews.append(lemma_words) return(reviews) train_sentences = clean_sentences(train) test_sentences = clean_sentences(test) print(len(train_sentences)) print(len(test_sentences))'",No,5,78.0 "target=train.Sentiment.values y_target=to_categorical(target) num_classes=y_target.shape[1]",No,5,20.0 "X_train,X_val,y_train,y_val = train_test_split(train_sentences, y_target, test_size=0.2, stratify=y_target)",No,5,13.0 "unique_words = set() len_max = 0 for sent in tqdm(X_train): unique_words.update(sent) if(len_max= 0].reset_index(drop=True) # y_train = 
X_train[['Store', 'Dept', 'Date', 'Weekly_Sales']] # X_train.drop('Weekly_Sales', axis=1, inplace=True) # y_train_no_neg = X_train_no_neg[['Store', 'Dept', 'Date', 'Weekly_Sales']] # X_train_no_neg.drop('Weekly_Sales', axis=1, inplace=True) # print(X_train.shape, X_train_no_neg.shape, (X_train.shape[0] - X_train_no_neg.shape[0]) / X_train.shape[0]) X_train.head()",No,4,45.0 X_train['Date'].dtype,No,5,70.0 "train=pd.read_csv(""/kaggle/input/walmart-recruiting-store-sales-forecasting/train.csv.zip"") train.head()",Yes,4,45.0 "test=pd.read_csv(""/kaggle/input/walmart-recruiting-store-sales-forecasting/test.csv.zip"") test.head()",Yes,5,45.0 "tesst=test.drop([""Date""],axis=1) tesst.head()",Yes,3,10.0 "sample=pd.read_csv(""/kaggle/input/walmart-recruiting-store-sales-forecasting/sampleSubmission.csv.zip"") sample.head()",Yes,4,7.0 "sample.to_csv(""20200309.csv"",index=False)",No,5,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load in import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) # Input data files are available in the ""../input/"" directory. # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory import os for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename)) os.cpu_count() # Any results you write to the current directory are saved as output.",Yes,5,88.0 "train = pd.read_csv(""/kaggle/input/walmart-recruiting-store-sales-forecasting/train.csv.zip"",parse_dates=[""Date""]) test = pd.read_csv(""/kaggle/input/walmart-recruiting-store-sales-forecasting/test.csv.zip"",parse_dates=[""Date""])",No,5,45.0 "add = pd.read_csv(""/kaggle/input/walmart-recruiting-store-sales-forecasting/features.csv.zip"") add.head()",Yes,4,45.0 "store = pd.read_csv(""/kaggle/input/walmart-recruiting-store-sales-forecasting/stores.csv"") store",Yes,5,45.0 "import seaborn as sns import matplotlib.pyplot as plt ax = sns.barplot(x=""Type"", y=""Size"", hue=""Type"", data=store) sns.catplot(x=""Type"", kind=""count"", palette=""ch:.25"", data=store)",No,5,33.0 "train = pd.merge(train,store,on = ""Store"",how=""left"") test = pd.merge(test,store,on = ""Store"",how=""left"")",No,5,32.0 "train[""year""] = train[""Date""].dt.year train[""month""] = train[""Date""].dt.month train[""day""] = train[""Date""].dt.day train[""week""] = train[""Date""].dt.week test[""year""] = test[""Date""].dt.year test[""month""] = test[""Date""].dt.month test[""day""] = test[""Date""].dt.day test[""week""] = test[""Date""].dt.week",No,5,8.0 "train[""Type""] = train[""Type""].replace({""A"":0,""B"":1,""C"":2}) test[""Type""] = test[""Type""].replace({""A"":0,""B"":1,""C"":2})",No,5,20.0 "def wmae(y_pred, targ, holiday_week): sumOfWeights = 0 sumofCol_B_X_Col_E = 0 for i in range(0, len(y_pred)): weight = 0 if holiday_week[i]: weight = 5 else: weight = 1 Col_B_X_Col_E = abs(targ[i] - y_pred[i])*weight sumOfWeights += weight sumofCol_B_X_Col_E += Col_B_X_Col_E WMAE = sumofCol_B_X_Col_E/sumOfWeights return WMAE",No,5,84.0 "train.groupby(""IsHoliday"")[""Weekly_Sales""].median()",No,3,40.0 "plt.figure(figsize=(10,6)) sns.barplot(train[""IsHoliday""],train[""Weekly_Sales""])",No,5,33.0 "train.groupby(""Type"")[""Weekly_Sales""].mean()",No,4,40.0 "plt.figure(figsize=(10,6)) 
sns.barplot(train[""Type""],train[""Weekly_Sales""])",No,5,33.0 "plt.figure(figsize=(10,6)) sns.boxplot(train[""year""],train[""Weekly_Sales""],showfliers=False)",No,5,75.0 "plt.figure(figsize=(10,6)) sns.boxplot(train[""Store""],train[""Weekly_Sales""],showfliers=False)",No,5,33.0 "plt.figure(figsize=(10,6)) sns.boxplot(train[""Dept""],train[""Weekly_Sales""],showfliers=False)",No,5,33.0 "feature = [""Store"",""Dept"",""year"",""month"",""day"",""week"",""IsHoliday"", ""Size""]",No,5,77.0 "X_train = train[feature] X_test = test[feature]",No,5,21.0 X_train,No,5,41.0 "y_train = train[""Weekly_Sales""]",No,5,21.0 "from sklearn.model_selection import train_test_split X_train1, X_test1, y_train1, y_test1 = train_test_split(X_train,y_train, test_size=0.2)",Yes,5,13.0 "%%time from sklearn.ensemble import RandomForestRegressor model = RandomForestRegressor(n_estimators=10,n_jobs=4) ",Yes,5,4.0 "model.fit(X_train1,y_train1)",No,5,7.0 "(pd.DataFrame([X_train.columns,model.feature_importances_],columns=feature).T).plot.bar()",No,5,79.0 "holidays = X_test1['IsHoliday'].to_numpy() y_test1 = y_test1.to_numpy()",No,5,16.0 result = model.predict(X_test1),No,5,48.0 "WMAE = wmae(result, y_test1, holidays) print(WMAE)",No,5,49.0 "'''from sklearn.model_selection import GridSearchCV # Create the parameter grid based on the results of random search param_grid = { 'bootstrap': [True], 'max_depth': [80, 90, 100, 110], 'max_features': [2, 3], 'min_samples_leaf': [3, 4, 5], 'min_samples_split': [8, 10, 12], 'n_estimators': [10, 100, 200] } # Create a based model rf = RandomForestRegressor() # Instantiate the grid search model grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, cv = 2, n_jobs = 3, verbose = 2)''' ",Yes,4,5.0 grid_search.best_params_,No,5,2.0 "data_dmatrix = xgb.DMatrix(data=X_train,label=y_train)",No,5,84.0 "xg_reg = xgb.XGBRegressor(objective ='reg:linear', colsample_bytree = 0.3, learning_rate = 0.1, # max_depth = 5, alpha = 10, n_estimators = 10)",No,5,4.0 "xg_reg.fit(X_train1,y_train1) result = xg_reg.predict(X_test1)",Yes,4,7.0 " %%time from sklearn.ensemble import RandomForestRegressor #model = RandomForestRegressor(bootstrap= True, max_depth= 110, max_features= 3, min_samples_leaf= 3, min_samples_split= 8, n_estimators= 200) model = RandomForestRegressor(n_estimators= 500) model.fit(X_train,y_train) result = model.predict(X_test)",Yes,4,7.0 "sub = pd.read_csv(""/kaggle/input/walmart-recruiting-store-sales-forecasting/sampleSubmission.csv.zip"")",No,5,45.0 "sub[""Weekly_Sales""] = result sub.head()",Yes,4,55.0 "sub.to_csv(""walmart_predict_sub.csv"",index=False)",No,5,25.0 "import numpy as np import scipy as sp import pandas as pd import matplotlib.pyplot as plt import random import os import sys import re from datetime import datetime import math from subprocess import check_output print(check_output([""ls"", ""../input""]).decode(""utf8""))",No,4,22.0 "pd.options.display.max_colwidth = 400 from IPython.display import FileLink, FileLinks",Yes,5,23.0 "train = pd.read_csv('/kaggle/input/web-traffic-time-series-forecasting/train_2.csv.zip') keys = pd.read_csv('/kaggle/input/web-traffic-time-series-forecasting/key_2.csv.zip') sub = pd.read_csv('/kaggle/input/web-traffic-time-series-forecasting/sample_submission_2.csv.zip')",No,5,45.0 keys.head(),No,5,41.0 "def split_page_col(page): tokens = page.split('_') article_name = ''.join(tokens[:-3]) org = tokens[-3] access = tokens[-2] crawler = tokens[-1] return (article_name, org, access, crawler) def split_page_col_wdate(page): tokens 
= page.split('_') article_name = ''.join(tokens[:-4]) org = tokens[-4] access = tokens[-3] crawler = tokens[-2] date = tokens[-1] return (article_name, org, access, crawler,date)",No,5,78.0 "keys['date'] = keys['Page'].apply(lambda x: x.split('_')[-1]) keys['Page'] = keys['Page'].apply(lambda x: '_'.join(x.split('_')[:-1])) keys['date'] = pd.to_datetime(keys['date'], format='%Y-%m-%d')",No,5,8.0 keys.tail(),No,5,41.0 "sub = sub.merge(keys, on='Id', how='left')",No,5,32.0 "sub['date'] = pd.to_datetime(sub['date'], format='%Y-%m-%d')",No,5,16.0 "print(sub['date'].min(), sub['date'].max()) print(sub['date'].max() - sub['date'].min())",No,3,40.0 "print(sub['Page'].nunique()) print(train['Page'].nunique())",No,5,54.0 "train.iloc[:, 755:803]",No,5,14.0 "print(sub.shape) print(train.shape)",No,5,58.0 "prev_year_data_cols = pd.date_range('2016-09-13', '2016-11-13') train_flat = pd.melt(train.loc[:, ['Page'] + list(prev_year_data_cols.date.astype(str))], id_vars='Page', var_name='date') train_flat['date'] = pd.to_datetime(train_flat['date'], format='%Y-%m-%d')",Yes,4,16.0 train_flat.head(),No,5,41.0 "train_flat['prediction_date'] = train_flat['date'] + pd.DateOffset(years=1) sub = sub[['Page', 'date', 'Id']].merge(train_flat[['Page', 'prediction_date', 'value']], left_on=('Page', 'date'), right_on=('Page', 'prediction_date')) sub['value'] = sub['value'].fillna(0)",Yes,3,32.0 "sub[['Id', 'value']].rename(columns={'value': 'visits'}).to_csv('all_submission.csv', index=False) FileLink('all_submission.csv')",Yes,5,25.0 "page_median = train.iloc[:, 1:].median(axis=1, skipna=True)",No,3,40.0 "page_median = pd.DataFrame({'Page': train['Page'], 'median': page_median})",No,5,12.0 page_median.head(),No,5,41.0 "sub_median = sub.merge(page_median, on='Page')[['Id', 'median']]",No,5,32.0 sub_median.isnull().mean(),No,3,40.0 sub_median,No,5,53.0 "sub_median.rename(columns={'median': 'visits'}).to_csv('submission.csv', index=False) FileLink('submission.csv')",Yes,5,25.0 "prev_year_data_cols = pd.date_range('2016-09-13', '2016-11-13') prev_year_median = train.loc[:, list(prev_year_data_cols.date.astype(str))].median(axis=1, skipna=True) ",Yes,5,77.0 "prev_year_median = pd.DataFrame({'Page': train['Page'], 'visits': prev_year_median})",No,5,12.0 "sub_prev_year_median = sub.merge(prev_year_median, on='Page')[['Id', 'visits']]",No,5,32.0 sub_prev_year_median.isnull().mean(),No,4,57.0 sub_prev_year_median['visits'] = sub_prev_year_median['visits'].fillna(sub_prev_year_median['visits'].median()),No,5,17.0 "median_60 = train.iloc[:, -60:].median(axis=1, skipna=True) median_60 = pd.DataFrame({'Page': train['Page'], 'visits': median_60}) sub_median_60 = sub.merge(median_60, on='Page')[['Id', 'visits']]",Yes,4,12.0 sub_median_60.isnull().mean(),No,3,57.0 sub_median_60['visits'] = sub_median_60['visits'].fillna(0),No,5,17.0 "# Libraries import numpy as np import pandas as pd pd.set_option('max_columns', None) import matplotlib.pyplot as plt import seaborn as sns plt.style.use('seaborn') %matplotlib inline import copy import datetime import lightgbm as lgb from scipy import stats from sklearn.model_selection import train_test_split, StratifiedKFold, KFold, cross_val_score, GridSearchCV, RepeatedStratifiedKFold from sklearn.preprocessing import StandardScaler import os import plotly.offline as py py.init_notebook_mode(connected=True) import plotly.graph_objs as go import plotly.tools as tls import xgboost as xgb import lightgbm as lgb from sklearn import model_selection from sklearn.metrics import accuracy_score, 
roc_auc_score, log_loss, classification_report, confusion_matrix import json import ast import time from sklearn import linear_model import warnings warnings.filterwarnings('ignore') import os import glob from sklearn.base import BaseEstimator, TransformerMixin from sklearn.preprocessing import LabelEncoder",Yes,5,23.0 "class LGBWrapper(object): """""" A wrapper for lightgbm model so that we will have a single api for various models. """""" def __init__(self): self.model = lgb.LGBMClassifier() def fit(self, X_train, y_train, X_valid=None, y_valid=None, X_holdout=None, y_holdout=None, params=None): eval_set = [(X_train, y_train)] eval_names = ['train'] self.model = self.model.set_params(**params) if X_valid is not None: eval_set.append((X_valid, y_valid)) eval_names.append('valid') if X_holdout is not None: eval_set.append((X_holdout, y_holdout)) eval_names.append('holdout') if 'cat_cols' in params.keys(): cat_cols = [col for col in params['cat_cols'] if col in X_train.columns] if len(cat_cols) > 0: categorical_columns = params['cat_cols'] else: categorical_columns = 'auto' else: categorical_columns = 'auto' self.model.fit(X=X_train, y=y_train, eval_set=eval_set, eval_names=eval_names, verbose=params['verbose'], early_stopping_rounds=params['early_stopping_rounds']) self.best_score_ = self.model.best_score_ self.feature_importances_ = self.model.feature_importances_ def predict_proba(self, X_test): if self.model.objective == 'binary': return self.model.predict_proba(X_test, num_iteration=self.model.best_iteration_)[:, 1] else: return self.model.predict_proba(X_test, num_iteration=self.model.best_iteration_)'",Yes,5,53.0 "class ClassifierModel(object): """""" A wrapper class for classification models. It can be used for training and prediction. Can plot feature importance and training progress (if relevant for model). """""" def __init__(self, columns: list = None, model_wrapper=None): """""" :param original_columns: :param model_wrapper: """""" self.columns = columns self.model_wrapper = model_wrapper self.result_dict = {} self.train_one_fold = False self.preprocesser = None def fit(self, X: pd.DataFrame, y, X_holdout: pd.DataFrame = None, y_holdout=None, folds=None, params: dict = None, eval_metric='auc', cols_to_drop: list = None, preprocesser=None, transformers: dict = None, adversarial: bool = False, plot: bool = True): """""" Training the model. :param X: training data :param y: training target :param X_holdout: holdout data :param y_holdout: holdout target :param folds: folds to split the data. 
If not defined, then model will be trained on the whole X :param params: training parameters :param eval_metric: metric for validataion :param cols_to_drop: list of columns to drop (for example ID) :param preprocesser: preprocesser class :param transformers: transformer to use on folds :param adversarial :return: """""" self.cols_to_drop = cols_to_drop if folds is None: folds = KFold(n_splits=3, random_state=42) self.train_one_fold = True self.columns = X.columns if self.columns is None else self.columns self.feature_importances = pd.DataFrame(columns=['feature', 'importance']) self.trained_transformers = {k: [] for k in transformers} self.transformers = transformers self.models = [] self.folds_dict = {} self.eval_metric = eval_metric n_target = 1 if len(set(y.values)) == 2 else len(set(y.values)) self.oof = np.zeros((len(X), n_target)) self.n_target = n_target X = X[self.columns] if X_holdout is not None: X_holdout = X_holdout[self.columns] if preprocesser is not None: self.preprocesser = preprocesser self.preprocesser.fit(X, y) X = self.preprocesser.transform(X, y) self.columns = X.columns.tolist() if X_holdout is not None: X_holdout = self.preprocesser.transform(X_holdout) # y = X['accuracy_group'] for fold_n, (train_index, valid_index) in enumerate(folds.split(X, y)): if X_holdout is not None: X_hold = X_holdout.copy() else: X_hold = None self.folds_dict[fold_n] = {} if params['verbose']: print(f'Fold {fold_n + 1} started at {time.ctime()}') self.folds_dict[fold_n] = {} X_train, X_valid = X.iloc[train_index], X.iloc[valid_index] y_train, y_valid = y.iloc[train_index], y.iloc[valid_index] if self.train_one_fold: X_train = X[self.original_columns] y_train = y X_valid = None y_valid = None datasets = {'X_train': X_train, 'X_valid': X_valid, 'X_holdout': X_hold, 'y_train': y_train} X_train, X_valid, X_hold = self.transform_(datasets, cols_to_drop) self.folds_dict[fold_n]['columns'] = X_train.columns.tolist() model = copy.deepcopy(self.model_wrapper) if adversarial: X_new1 = X_train.copy() if X_valid is not None: X_new2 = X_valid.copy() elif X_holdout is not None: X_new2 = X_holdout.copy() X_new = pd.concat([X_new1, X_new2], axis=0) y_new = np.hstack((np.zeros((X_new1.shape[0])), np.ones((X_new2.shape[0])))) X_train, X_valid, y_train, y_valid = train_test_split(X_new, y_new) model.fit(X_train, y_train, X_valid, y_valid, X_hold, y_holdout, params=params) self.folds_dict[fold_n]['scores'] = model.best_score_ if self.oof.shape[0] != len(X): self.oof = np.zeros((X.shape[0], self.oof.shape[1])) if not adversarial: self.oof[valid_index] = model.predict_proba(X_valid).reshape(-1, n_target) fold_importance = pd.DataFrame(list(zip(X_train.columns, model.feature_importances_)), columns=['feature', 'importance']) self.feature_importances = self.feature_importances.append(fold_importance) self.models.append(model) self.feature_importances['importance'] = self.feature_importances['importance'].astype(float) # if params['verbose']: self.calc_scores_() if plot: # print(classification_report(y, self.oof.argmax(1))) print(classification_report(y, (self.oof > 0.5) * 1)) fig, ax = plt.subplots(figsize=(16, 12)) plt.subplot(2, 2, 1) self.plot_feature_importance(top_n=25) plt.subplot(2, 2, 2) self.plot_metric() plt.subplot(2, 2, 3) g = sns.heatmap(confusion_matrix(y, (self.oof > 0.5) * 1), annot=True, cmap=plt.cm.Blues,fmt=""d"") g.set(ylim=(-0.5, 4), xlim=(-0.5, 4), title='Confusion matrix') plt.subplot(2, 2, 4) plt.hist(self.oof) plt.xticks(range(self.n_target), range(self.n_target)) plt.title('Distribution 
of oof predictions'); def transform_(self, datasets, cols_to_drop): for name, transformer in self.transformers.items(): transformer.fit(datasets['X_train'], datasets['y_train']) datasets['X_train'] = transformer.transform(datasets['X_train']) if datasets['X_valid'] is not None: datasets['X_valid'] = transformer.transform(datasets['X_valid']) if datasets['X_holdout'] is not None: datasets['X_holdout'] = transformer.transform(datasets['X_holdout']) self.trained_transformers[name].append(transformer) if cols_to_drop is not None: cols_to_drop = [col for col in cols_to_drop if col in datasets['X_train'].columns] self.cols_to_drop = cols_to_drop datasets['X_train'] = datasets['X_train'].drop(cols_to_drop, axis=1) if datasets['X_valid'] is not None: datasets['X_valid'] = datasets['X_valid'].drop(cols_to_drop, axis=1) if datasets['X_holdout'] is not None: datasets['X_holdout'] = datasets['X_holdout'].drop(cols_to_drop, axis=1) return datasets['X_train'], datasets['X_valid'], datasets['X_holdout'] def calc_scores_(self): print() datasets = [k for k, v in [v['scores'] for k, v in self.folds_dict.items()][0].items() if len(v) > 0] self.scores = {} for d in datasets: scores = [v['scores'][d][self.eval_metric] for k, v in self.folds_dict.items()] print(f""CV mean score on {d}: {np.mean(scores):.4f} +/- {np.std(scores):.4f} std."") self.scores[d] = np.mean(scores) def predict(self, X_test, averaging: str = 'usual'): """""" Make prediction :param X_test: :param averaging: method of averaging :return: """""" full_prediction = np.zeros((X_test.shape[0], self.oof.shape[1])) if self.preprocesser is not None: X_test = self.preprocesser.transform(X_test) for i in range(len(self.models)): X_t = X_test.copy() for name, transformers in self.trained_transformers.items(): X_t = transformers[i].transform(X_t) if self.cols_to_drop: cols_to_drop = [col for col in self.cols_to_drop if col in X_t.columns] X_t = X_t.drop(cols_to_drop, axis=1) y_pred = self.models[i].predict_proba(X_t[self.folds_dict[i]['columns']]).reshape(-1, full_prediction.shape[1]) # if case transformation changes the number of the rows if full_prediction.shape[0] != len(y_pred): full_prediction = np.zeros((y_pred.shape[0], self.oof.shape[1])) if averaging == 'usual': full_prediction += y_pred elif averaging == 'rank': full_prediction += pd.Series(y_pred).rank().values return full_prediction / len(self.models) def plot_feature_importance(self, drop_null_importance: bool = True, top_n: int = 10): """""" Plot default feature importance. :param drop_null_importance: drop columns with null feature importance :param top_n: show top n columns :return: """""" top_feats = self.get_top_features(drop_null_importance, top_n) feature_importances = self.feature_importances.loc[self.feature_importances['feature'].isin(top_feats)] feature_importances['feature'] = feature_importances['feature'].astype(str) top_feats = [str(i) for i in top_feats] sns.barplot(data=feature_importances, x='importance', y='feature', orient='h', order=top_feats) plt.title('Feature importances') def get_top_features(self, drop_null_importance: bool = True, top_n: int = 10): """""" Get top features by importance. :param drop_null_importance: :param top_n: :return: """""" grouped_feats = self.feature_importances.groupby(['feature'])['importance'].mean() if drop_null_importance: grouped_feats = grouped_feats[grouped_feats != 0] return list(grouped_feats.sort_values(ascending=False).index)[:top_n] def plot_metric(self): """""" Plot training progress. 
Inspired by `plot_metric` from https://lightgbm.readthedocs.io/en/latest/_modules/lightgbm/plotting.html :return: """""" full_evals_results = pd.DataFrame() for model in self.models: evals_result = pd.DataFrame() for k in model.model.evals_result_.keys(): evals_result[k] = model.model.evals_result_[k][self.eval_metric] evals_result = evals_result.reset_index().rename(columns={'index': 'iteration'}) full_evals_results = full_evals_results.append(evals_result) full_evals_results = full_evals_results.melt(id_vars=['iteration']).rename(columns={'value': self.eval_metric, 'variable': 'dataset'}) full_evals_results[self.eval_metric] = np.abs(full_evals_results[self.eval_metric]) sns.lineplot(data=full_evals_results, x='iteration', y=self.eval_metric, hue='dataset') plt.title('Training progress')'",Yes,5,53.0 "data_dict = {} for i in glob.glob('/kaggle/input/google-cloud-ncaa-march-madness-2020-division-1-mens-tournament/MDataFiles_Stage1/*'): name = i.split('/')[-1].split('.')[0] if name != 'MTeamSpellings': data_dict[name] = pd.read_csv(i) else: data_dict[name] = pd.read_csv(i, encoding='cp1252')",No,3,45.0 data_dict.keys(),No,2,40.0 data_dict['MNCAATourneySeeds'].head(),No,5,41.0 data_dict['MNCAATourneyCompactResults'].head(),No,5,41.0 "data_dict['MNCAATourneyCompactResults'].groupby(['Season'])['WScore'].mean().plot(kind='line'); plt.title('Mean scores of winning teams by season in tourneys');",No,5,75.0 data_dict['MRegularSeasonCompactResults'],No,5,41.0 "data_dict['MRegularSeasonCompactResults'].groupby(['Season'])['WScore'].mean().plot(); plt.title('Mean scores of winning teams by season in regular plays');",No,5,75.0 "# process seed data_dict['MNCAATourneySeeds'] = data_dict['MNCAATourneySeeds'].loc[data_dict['MNCAATourneySeeds']['Season'] <= 2014] data_dict['MNCAATourneySeeds']['Seed'] = data_dict['MNCAATourneySeeds']['Seed'].apply(lambda x: int(x[1:3])) # take only useful columns data_dict['MNCAATourneySeeds'] = data_dict['MNCAATourneySeeds'][['Season', 'TeamID', 'Seed']] data_dict['MNCAATourneyCompactResults'] = data_dict['MNCAATourneyCompactResults'][['Season','WTeamID', 'LTeamID']] data_dict['MNCAATourneyCompactResults'] = data_dict['MNCAATourneyCompactResults'].loc[data_dict['MNCAATourneyCompactResults']['Season'] <= 2014] # merge the data and rename the columns df = pd.merge(data_dict['MNCAATourneyCompactResults'], data_dict['MNCAATourneySeeds'], how='left', left_on=['Season', 'WTeamID'], right_on=['Season', 'TeamID']) df = pd.merge(df, data_dict['MNCAATourneySeeds'], how='left', left_on=['Season', 'LTeamID'], right_on=['Season', 'TeamID']) df = df.drop(['TeamID_x', 'TeamID_y'], axis=1) df.columns = ['Season', 'WTeamID', 'LTeamID', 'WSeed', 'LSeed'] df.head()",Yes,3,8.0 "team_win_score = data_dict['MRegularSeasonCompactResults'].groupby(['Season', 'WTeamID']).agg({'WScore':['sum', 'count']}).reset_index() team_win_score.columns = ['Season', 'WTeamID', 'WScore_sum', 'WScore_count'] team_loss_score = data_dict['MRegularSeasonCompactResults'].groupby(['Season', 'LTeamID']).agg({'LScore':['sum', 'count']}).reset_index() team_loss_score.columns = ['Season', 'LTeamID', 'LScore_sum', 'LScore_count'] df = pd.merge(df, team_win_score, how='left', left_on=['Season', 'WTeamID'], right_on=['Season', 'WTeamID']) df = pd.merge(df, team_loss_score, how='left', left_on=['Season', 'LTeamID'], right_on=['Season', 'LTeamID']) df = pd.merge(df, team_loss_score, how='left', left_on=['Season', 'WTeamID'], right_on=['Season', 'LTeamID']) df = pd.merge(df, team_win_score, how='left', left_on=['Season', 
'LTeamID_x'], right_on=['Season', 'WTeamID']) df.drop(['LTeamID_y', 'WTeamID_y'], axis=1, inplace=True) df.head()",Yes,3,32.0 "df_win = df.copy() df_los = df.copy() df_win = df_win[['WSeed', 'LSeed', 'x_score', 'y_score', 'x_count', 'y_count']] df_los = df_los[['LSeed', 'WSeed', 'y_score', 'x_score', 'y_count', 'x_count']] df_win.columns = ['Seed_1', 'Seed_2', 'Score_1', 'Score_2', 'Count_1', 'Count_2'] df_los.columns = ['Seed_1', 'Seed_2', 'Score_1', 'Score_2', 'Count_1', 'Count_2']",Yes,4,12.0 "df_win['Seed_diff'] = df_win['Seed_1'] - df_win['Seed_2'] df_win['Score_diff'] = df_win['Score_1'] - df_win['Score_2'] df_los['Seed_diff'] = df_los['Seed_1'] - df_los['Seed_2'] df_los['Score_diff'] = df_los['Score_1'] - df_los['Score_2'] df_win['Count_diff'] = df_win['Count_1'] - df_win['Count_2'] df_win['Mean_score1'] = df_win['Score_1'] / df_win['Count_1'] df_win['Mean_score2'] = df_win['Score_2'] / df_win['Count_2'] df_win['Mean_score_diff'] = df_win['Mean_score1'] - df_win['Mean_score2'] df_los['Count_diff'] = df_los['Count_1'] - df_los['Count_2'] df_los['Mean_score1'] = df_los['Score_1'] / df_los['Count_1'] df_los['Mean_score2'] = df_los['Score_2'] / df_los['Count_2'] df_los['Mean_score_diff'] = df_los['Mean_score1'] - df_los['Mean_score2']",No,3,8.0 "df_win['result'] = 1 df_los['result'] = 0 data = pd.concat((df_win, df_los)).reset_index(drop=True)",No,4,11.0 "for col in ['Score_1', 'Score_2', 'Count_1', 'Count_2', 'Score_diff', 'Count_diff']: print(col) data[col] = data[col].fillna(0).astype(int)",No,5,17.0 "X = data.drop(['result'], axis=1) y = data['result']",No,5,21.0 "# some of params are from this kernel: https://www.kaggle.com/ratan123/march-madness-2020-ncaam-simple-lightgbm-on-kfold param = {'n_estimators':10000, 'num_leaves': 400, 'min_child_weight': 0.034, 'feature_fraction': 0.379, 'bagging_fraction': 0.418, 'min_data_in_leaf': 106, 'objective': 'binary', 'max_depth': -1, 'learning_rate': 0.007, ""boosting_type"": ""gbdt"", #""bagging_seed"": 11, ""metric"": 'binary_logloss', ""verbosity"": 10, 'reg_alpha': 0.3899, 'reg_lambda': 0.648, 'random_state': 47, 'task':'train', 'nthread':-1, 'verbose': 100, 'early_stopping_rounds': 30, 'eval_metric': 'binary_logloss' } cat_cols = [] mt = MainTransformer(create_interactions=False) # ct = CategoricalTransformer(drop_original=True, cat_cols=cat_cols) ft = FeatureTransformer() transformers = {'ft': ft} lgb_model = ClassifierModel(model_wrapper=LGBWrapper()) lgb_model.fit(X=X, y=y, folds=folds, params=param, preprocesser=mt, transformers=transformers, eval_metric='binary_logloss', cols_to_drop=None, plot=True)'",Yes,3,59.0 "test = pd.read_csv('../input/google-cloud-ncaa-march-madness-2020-division-1-mens-tournament/MSampleSubmissionStage1_2020.csv') test = test.drop(['Pred'], axis=1) test['Season'] = test['ID'].apply(lambda x: int(x.split('_')[0])) test['Team1'] = test['ID'].apply(lambda x: int(x.split('_')[1])) test['Team2'] = test['ID'].apply(lambda x: int(x.split('_')[2])) test = pd.merge(test, data_dict['MNCAATourneySeeds'], how='left', left_on=['Season', 'Team1'], right_on=['Season', 'TeamID']) test = pd.merge(test, data_dict['MNCAATourneySeeds'], how='left', left_on=['Season', 'Team2'], right_on=['Season', 'TeamID']) test = pd.merge(test, team_win_score, how='left', left_on=['Season', 'Team1'], right_on=['Season', 'WTeamID']) test = pd.merge(test, team_loss_score, how='left', left_on=['Season', 'Team2'], right_on=['Season', 'LTeamID']) test = pd.merge(test, team_loss_score, how='left', left_on=['Season', 'Team1'], right_on=['Season', 
'LTeamID']) test = pd.merge(test, team_win_score, how='left', left_on=['Season', 'Team2'], right_on=['Season', 'WTeamID']) test['seed_diff'] = test['Seed_x'] - test['Seed_y']",Yes,2,45.0 "test['x_score'] = test['WScore_sum_x'] + test['LScore_sum_y'] test['y_score'] = test['WScore_sum_y'] + test['LScore_sum_x'] test['x_count'] = test['WScore_count_x'] + test['LScore_count_y'] test['y_count'] = test['WScore_count_y'] + test['WScore_count_x']",No,5,8.0 "test = test[['Seed_x', 'Seed_y', 'x_score', 'y_score', 'x_count', 'y_count']] test.columns = ['Seed_1', 'Seed_2', 'Score_1', 'Score_2', 'Count_1', 'Count_2']",No,4,10.0 "test['Seed_diff'] = test['Seed_1'] - test['Seed_2'] test['Score_diff'] = test['Score_1'] - test['Score_2'] test['Seed_diff'] = test['Seed_1'] - test['Seed_2'] test['Score_diff'] = test['Score_1'] - test['Score_2'] test['Count_diff'] = test['Count_1'] - test['Count_2'] test['Mean_score1'] = test['Score_1'] / test['Count_1'] test['Mean_score2'] = test['Score_2'] / test['Count_2'] test['Mean_score_diff'] = test['Mean_score1'] - test['Mean_score2'] test['Count_diff'] = test['Count_1'] - test['Count_2'] test['Mean_score1'] = test['Score_1'] / test['Count_1'] test['Mean_score2'] = test['Score_2'] / test['Count_2'] test['Mean_score_diff'] = test['Mean_score1'] - test['Mean_score2']",No,5,8.0 test_preds = lgb_model.predict(test),No,5,48.0 plt.hist(test_preds);,No,5,56.0 "submission_df = pd.read_csv('../input/google-cloud-ncaa-march-madness-2020-division-1-mens-tournament/MSampleSubmissionStage1_2020.csv') submission_df['Pred'] = test_preds submission_df",Yes,5,45.0 "submission_df.to_csv('submission.csv', index=False)",No,5,25.0 "import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) import matplotlib.pyplot as plt from itertools import cycle, islice import seaborn as sb import matplotlib.dates as dates import datetime as dt import plotly.offline as py py.init_notebook_mode(connected=True) from plotly import tools, subplots import plotly.figure_factory as ff import plotly.express as px import plotly.graph_objects as go import os for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: print(os.path.join(dirname, filename))",No,4,88.0 "train_data = pd.read_csv(""/kaggle/input/covid19-global-forecasting-week-4/train.csv"")#index_col=0 display(train_data.head()) test_data = pd.read_csv(""/kaggle/input/covid19-global-forecasting-week-4/test.csv"")#index_col=0 display(test_data.head())",Yes,4,45.0 "sum_df = pd.pivot_table(train_data, values=['ConfirmedCases','Fatalities'], index=['Date'],aggfunc=np.sum) display(sum_df.max())",Yes,3,12.0 "def getColumnInfo(df): n_province = df['Province_State'].nunique() n_country = df['Country_Region'].nunique() n_days = df['Date'].nunique() start_date = df['Date'].unique()[0] end_date = df['Date'].unique()[-1] return n_province, n_country, n_days, start_date, end_date n_train = train_data.shape[0] n_test = test_data.shape[0] n_prov_train, n_count_train, n_train_days, start_date_train, end_date_train = getColumnInfo(train_data) n_prov_test, n_count_test, n_test_days, start_date_test, end_date_test = getColumnInfo(test_data) print ('<==Train data==> \ # of Province_State: '+str(n_prov_train),', # of Country_Region:'+str(n_count_train), ', Time Period: '+str(start_date_train)+' to '+str(end_date_train), '==> days:',str(n_train_days)) print(""\ Countries with Province/State information: "", train_data[train_data['Province_State'].isna()==False]['Country_Region'].unique()) print ('\ 
<==Test data==> \ # of Province_State: '+str(n_prov_test),', # of Country_Region:'+str(n_count_test), ', Time Period: '+start_date_test+' to '+end_date_test, '==> days:',n_test_days) df_test = test_data.loc[test_data.Date > '2020-04-14'] overlap_days = n_test_days - df_test.Date.nunique() print('\ overlap days with training data: ', overlap_days, ', total days: ', n_train_days+n_test_days-overlap_days)'",No,3,54.0 "prob_confirm_check_train = train_data.ConfirmedCases.value_counts(normalize=True) prob_fatal_check_train = train_data.Fatalities.value_counts(normalize=True) n_confirm_train = train_data.ConfirmedCases.value_counts()[1:].sum() n_fatal_train = train_data.Fatalities.value_counts()[1:].sum() print('Percentage of confirmed case records = {0:<2.0f}/{1:<2.0f} = {2:<2.1f}%'.format(n_confirm_train, n_train, prob_confirm_check_train[1:].sum()*100)) print('Percentage of fatality records = {0:<2.0f}/{1:<2.0f} = {2:<2.1f}%'.format(n_fatal_train, n_train, prob_fatal_check_train[1:].sum()*100))",No,3,72.0 "from itertools import cycle, islice discrete_col = list(islice(cycle(['orange', 'r', 'g', 'k', 'b', 'c', 'm']), None, len(train_data_by_country_confirm.head(30)))) plt.rcParams.update({'font.size': 22}) train_data_by_country_confirm.head(20).plot(figsize=(20,15), kind='barh', color=discrete_col) plt.legend([""Confirmed Cases"", ""Fatalities""]); plt.xlabel(""Number of Covid-19 Affectees"") plt.title(""First 20 Countries with Highest Confirmed Cases"") ylocs, ylabs = plt.yticks() for i, v in enumerate(train_data_by_country_confirm.head(20)[""ConfirmedCases""][:]): plt.text(v+0.01, ylocs[i]-0.25, str(int(v)), fontsize=12) for i, v in enumerate(train_data_by_country_confirm.head(20)[""Fatalities""][:]): if v > 0: #disply for only >300 fatalities plt.text(v+0.01,ylocs[i]+0.1,str(int(v)),fontsize=12) '",No,4,33.0 "def reformat_time(reformat, ax): ax.xaxis.set_major_locator(dates.WeekdayLocator()) ax.xaxis.set_major_formatter(dates.DateFormatter('%b %d')) if reformat: #reformat again if you wish date_list = train_data_by_date.reset_index()[""Date""].tolist() x_ticks = [dt.datetime.strftime(t,'%Y-%m-%d') for t in date_list] x_ticks = [tick for i,tick in enumerate(x_ticks) if i%8==0 ]# split labels into same number of ticks as by pandas ax.set_xticklabels(x_ticks, rotation=90) # cosmetics ax.yaxis.grid(linestyle='dotted') ax.spines['right'].set_color('none') ax.spines['top'].set_color('none') ax.spines['left'].set_color('none') ax.spines['bottom'].set_color('none') train_data['Date'] = pd.to_datetime(train_data['Date']) train_data_by_date = train_data.groupby(['Date'],as_index=True).agg({'ConfirmedCases': 'sum','Fatalities': 'sum', 'NewConfirmedCases':'sum', 'NewFatalities':'sum', 'MortalityRate':'mean'}) num0 = train_data_by_date._get_numeric_data() num0[num0 < 0.0] = 0.0 #display(train_data_by_date.head()) ## ======= Sort by countries with fatalities > 600 ======== train_data_by_country_max = train_data.groupby(['Country_Region'],as_index=True).agg({'ConfirmedCases': 'max', 'Fatalities': 'max'}) train_data_by_country_fatal = train_data_by_country_max[train_data_by_country_max['Fatalities']>600] train_data_by_country_fatal = train_data_by_country_fatal.sort_values(by=['Fatalities'],ascending=False).reset_index() #display(train_data_by_country_fatal.head(20)) df_merge_by_country = pd.merge(train_data,train_data_by_country_fatal['Country_Region'],on=['Country_Region'],how='inner') df_max_fatality_country = df_merge_by_country.groupby(['Date','Country_Region'],as_index=False).agg({'ConfirmedCases': 
'sum', 'Fatalities': 'sum', 'NewConfirmedCases':'sum', 'NewFatalities':'sum', 'MortalityRate':'mean'}) num1 = df_max_fatality_country._get_numeric_data() num1[num1 < 0.0] = 0.0 df_max_fatality_country.set_index('Date',inplace=True) #display(df_max_fatality_country.head(20)) countries = train_data_by_country_fatal['Country_Region'].unique() plt.rcParams.update({'font.size': 16}) fig,(ax0,ax1) = plt.subplots(1,2,figsize=(15, 8)) fig,(ax2,ax3) = plt.subplots(1,2,figsize=(15, 8))#,sharey=True) train_data_by_date.ConfirmedCases.plot(ax=ax0, x_compat=True, title='Confirmed Cases Globally', legend='Confirmed Cases', color=discrete_col)#, logy=True) reformat_time(0,ax0) train_data_by_date.NewConfirmedCases.plot(ax=ax0, x_compat=True, linestyle='dotted', legend='New Confirmed Cases', color=discrete_col)#, logy=True) reformat_time(0,ax0) train_data_by_date.Fatalities.plot(ax=ax2, x_compat=True, title='Fatalities Globally', legend='Fatalities', color='r') reformat_time(0,ax2) train_data_by_date.NewFatalities.plot(ax=ax2, x_compat=True, linestyle='dotted', legend='Daily Deaths',color='r')#tell pandas not to use its own datetime format reformat_time(0,ax2) for country in countries: match = df_max_fatality_country.Country_Region==country df_fatality_by_country = df_max_fatality_country[match] df_fatality_by_country.ConfirmedCases.plot(ax=ax1, x_compat=True, title='Confirmed Cases Nationally') reformat_time(0,ax1) df_fatality_by_country.Fatalities.plot(ax=ax3, x_compat=True, title='Fatalities Nationally') reformat_time(0,ax3) #ax1.legend(countries) #ax3.legend(countries) ax1.legend(countries, loc='center left',bbox_to_anchor=(1.0, 0.5)) ax3.legend(countries, loc='center left',bbox_to_anchor=(1.0, 0.5)) '",No,4,75.0 "fig = plt.figure() fig,(ax4,ax5) = plt.subplots(1,2,figsize=(20, 8)) #train_data_by_date.loc[(train_data_by_date.ConfirmedCases > 200)]#useless, its already summed. 
train_data_by_date.MortalityRate.plot(ax=ax4, x_compat=True, legend='Mortality Rate',color='r')#tell pandas not to use its own datetime format reformat_time(0,ax4) for num, country in enumerate(countries): match = df_max_fatality_country.Country_Region==country df_fatality_by_country = df_max_fatality_country[match] df_fatality_by_country.MortalityRate.plot(ax=ax5, x_compat=True, title='Average Mortality Rate Nationally') reformat_time(0,ax5) ax5.legend(countries, loc='center left',bbox_to_anchor=(1.0, 0.5))",No,4,75.0 "train_data_by_max_date = train_data_by_country.query('(Date == @max_train_date) & (ConfirmedCases > 100)') train_data_by_max_date.loc[:, 'MortalityRate'] = train_data_by_max_date.loc[:,'Fatalities']/train_data_by_max_date.loc[:,'ConfirmedCases'] train_data_by_mortality = train_data_by_max_date.sort_values('MortalityRate', ascending=False) train_data_by_mortality.set_index('Country_Region', inplace=True) #display(train_data_by_mortality.head()) palette = plt.get_cmap('OrRd_r') rainbow_col = [palette(1.*i/20.0) for i in range(20)] train_data_by_mortality.MortalityRate.head(20).plot(figsize=(15,10), kind='barh', color=rainbow_col) plt.xlabel(""Mortality Rate"") plt.title(""First 20 Countries with Highest Mortality Rate"") ylocs, ylabs = plt.yticks() '",No,5,33.0 "#import plotly.io as pio # to set shahin plot layout world_df = train_data_by_country.query('Date == @max_train_date') world_df.loc[:,'Date'] = world_df.loc[:,'Date'].apply(str) world_df.loc[:,'Confirmed_log'] = round(np.log10(world_df.loc[:,'ConfirmedCases'] + 1), 3) world_df.loc[:,'Fatalities_log'] = np.log10(world_df.loc[:,'Fatalities'] + 1) world_df.loc[:,'MortalityRate'] = round(world_df.loc[:, 'Fatalities'] / world_df.loc[:,'ConfirmedCases'], 3) world_df.loc[:,'GrowthFactor'] = round(world_df.loc[:,'GrowthRate'], 3) #display(world_df.head()) fig1 = px.choropleth(world_df, locations=""Country_Region"", locationmode=""country names"", color=""Confirmed_log"", hover_name=""Country_Region"", hover_data=['ConfirmedCases', 'Fatalities', 'MortalityRate', 'GrowthFactor'], range_color=[world_df['Confirmed_log'].min(), world_df['Confirmed_log'].max()], color_continuous_scale = px.colors.sequential.Plasma, title='COVID-19: Confirmed Cases') fig1.show() '",No,5,84.0 "fig2 = px.scatter_geo(world_df, locations=""Country_Region"", locationmode=""country names"", color=""ConfirmedCases"", size='ConfirmedCases', hover_name=""Country_Region"", hover_data=['ConfirmedCases', 'Fatalities', 'MortalityRate', 'GrowthFactor'], range_color= [world_df['Confirmed_log'].min(), world_df['ConfirmedCases'].max()], projection=""natural earth"", animation_frame=""Date"", animation_group=""Country_Region"", color_continuous_scale=""portland"", title='COVID-19: Spread Over Time') #fig2.layout.updatemenus[0].buttons[0].args[1][""frame""][""duration""] = 10 #fig2.layout.updatemenus[0].buttons[0].args[1][""transition""][""duration""] = 10 fig2.layout.coloraxis.showscale = False #fig2.layout.sliders[0].pad.t = 10 #fig2.layout.updatemenus[0].pad.t= 10 fig2.show()'",No,5,84.0 "from sklearn.linear_model import Ridge from sklearn.preprocessing import PolynomialFeatures from sklearn.pipeline import make_pipeline from tqdm import tqdm plt.rcParams.update({'font.size': 12}) fig,(ax0,ax1) = plt.subplots(1,2,figsize=(20, 8)) countries_europe = ['Italy', 'France', 'Spain', 'Germany', 'United Kingdom'] # Take the 1st day as 2020-02-23 df = train_data.loc[train_data.Date >= '2020-02-23'] n_days_europe = df.Date.nunique() rainbow_col= 
plt.cm.jet(np.linspace(0,1,len(countries))) for country, c in tqdm(zip(countries,rainbow_col)): df_country_train = df_max_fatality_country[df_max_fatality_country['Country_Region']==country] df_country_test = test_data[test_data['Country_Region']==country] df_country_train = df_country_train.reset_index()[df_country_train.reset_index().Date > '2020-02-22'] n_days_sans_China = df.Date.nunique() - df_country_train.Date.nunique() x_train = np.arange(1, n_days_europe+1).reshape((-1,1)) x_test = (np.arange(1,n_days_europe+n_test_days+1-overlap_days)).reshape((-1,1)) y_train_f = df_country_train['Fatalities'] #print (x_train, y_train_f) model_f = make_pipeline(PolynomialFeatures(degree=3), Ridge(fit_intercept=False)) model_f = model_f.fit(x_train, y_train_f) y_predict_f = model_f.predict(x_test) #print (x_test[-n_test_days:], y_predict_f[-n_test_days:]) y_train_c = df_country_train['ConfirmedCases'] model_c = make_pipeline(PolynomialFeatures(degree=3), Ridge(fit_intercept=False)) model_c = model_c.fit(x_train, y_train_c) y_predict_c = model_c.predict(x_test) extend_days_test = [i+len(x_test) for i in range(n_days_sans_China)] x_test = np.append(x_test, extend_days_test) y_predict_c = np.pad(y_predict_c, (n_days_sans_China, 0), 'constant') y_predict_f = np.pad(y_predict_f, (n_days_sans_China, 0), 'constant') ax0.plot(x_test[-n_test_days:], y_predict_c[-n_test_days:],linewidth=2, label='predict_'+country, color=c) ax0.plot(x_train, y_train_c, linewidth=2, color=c, linestyle='dotted', label='train_'+country) ax0.set_title(""Prediction vs Training for Confirmed Cases"") ax0.set_xlabel(""Number of days"") ax0.set_ylabel(""Confirmed Cases"") #ax0.legend(loc='center left',bbox_to_anchor=(1.0, 0.5)) #ax0.set_yscale('log') ax1.plot(x_test[-(n_test_days):], y_predict_f[-(n_test_days):],linewidth=2, label='predict_'+country, color=c) ax1.plot(x_train, y_train_f, linewidth=2, color=c, linestyle='dotted', label='train_'+country) ax1.set_title(""Prediction vs Training for Fatalities"") ax1.set_xlabel(""Number of days"") ax1.set_ylabel(""Fatalities"") ax1.legend(loc='center left',bbox_to_anchor=(1.0, 0.5)) #ax1.set_yscale('log')'",Yes,5,56.0 "from scipy.optimize.minpack import curve_fit from sklearn.metrics import r2_score from scipy.special import expit def Gompertz(a, c, t, t0): Q = a * np.exp(-np.exp(-c*(t-t0))) return Q def Boltzman(a, c, t, t0): Q = a / (1 + np.exp(-c*(t-t0))) return Q emerging_countries = ['Albania', 'Andorra', 'Argentina', 'Armenia', 'Azerbaijan', 'Bahrain', 'Barbados', 'Bhtan', 'Bulgaria', 'Burkina Faso', 'Cambodia', 'Chile', 'Colombia', 'Congo (Kinshasa)', 'Costa Rica', 'Cote dIvoire', 'Croatia', 'Cuba', 'Cyprus', 'Czechia', 'Dominican Republic', 'Egypt', 'Estonia', 'Georgia', 'Greece', 'Honduras', 'Iceland', 'Iraq', 'Israel', 'Jamaica', 'Japan', 'Jordan', 'Kuwait', 'Latvia', 'Lebanon', 'Lithuania', 'Luxembourg', 'Malaysia', 'Maldives', 'Malta', 'Mauritania', 'Mauritius', 'Monaco', 'Mongolia', 'Montenegro', 'Morocco', 'Namibia', 'Nigeria', 'North Macedonia', 'Norway', 'Oman', 'Panama','Paraguay', 'Rawanda', 'Saint Lucia', 'San Marino', 'Senegal', 'Seychelles', 'Singapore','Slovakia', 'Slovenia', 'Sri Lanka', 'Thailand', 'Tunisia', 'Uganda', 'Uruguay', 'Venezuela'] def get_bounds_fatal (country, isState, y_train): x = '' for c in emerging_countries: if country == c: x = c; break maximum = max(y_train) if maximum == 0.0: maximum = 1.0 if country == 'China': lower = [0, 0.02, 0] upper = [2.0*maximum,0.16, 40] elif country == 'Iran': lower = [0, 0.00, 0] upper = [3.0*maximum,0.11, 68] 
elif country == 'Italy': lower = [0, 0.00, 0] upper = [3.0*maximum,0.13, 72] elif country == 'US': lower = [0, 0.02, 0] if maximum <=10:upper = [4.0*maximum, 0.30, 85] else: upper = [3.5*maximum, 0.20, 90] elif country == 'France': lower = [0, 0.02, 0] if maximum <=10:upper = [4.0*maximum,0.18, 80] else: upper = [4.0*maximum,0.15, 90] elif country == 'Spain': lower = [0, 0.02, 0] upper = [3.0*maximum,0.15, 78] elif country == 'Germany': lower = [0.0, 0.02, 0] upper = [3.0*maximum,0.20, 85] elif country == 'Belgium': lower = [0.0, 0.02, 0] upper = [3.0*maximum,0.25, 88] elif country == 'Turkey': lower = [0.0, 0.02, 0] upper = [3.5*maximum,0.22, 90] elif country == 'Netherlands': lower = [0.0, 0.02, 0] upper = [4.0*maximum,0.14, 88] elif country == 'Switzerland': lower = [0.0, 0.02, 0] upper = [4.0*maximum,0.12, 90] elif country == 'United Kingdom': lower = [0.0, 0.02, 0] upper = [4.5*maximum,0.16, 95] elif country == 'Portugal': lower = [100, 0.02, 0] upper = [4.5*maximum,0.12, 95] elif country == 'Sweden': lower = [100, 0.02, 0] upper = [4.0*maximum,0.18, 90] elif country == 'Brazil': lower = [100, 0.02, 0] upper = [3.5*maximum,0.20, 90] elif country == 'Indonesia': lower = [100, 0.02, 0] upper = [4.5*maximum,0.10, 95] elif country == 'Austria': lower = [0, 0.02, 0] upper = [4.5*maximum,0.10, 95] elif country == 'Ireland': lower = [0, 0.02, 0] upper = [4.5*maximum,0.15, 95] elif country == 'Canada': lower = [0, 0.02, 0] if maximum <=10: upper = [2.0*maximum, 0.20, 65] else: upper = [4.5*maximum, 0.16, 95] elif country == 'India': lower = [0, 0.02, 0] upper = [4.5*maximum,0.20, 95] elif country == 'Ecuador': lower = [0, 0.02, 0] upper = [4.5*maximum,0.16, 96] elif country == 'Romania': lower = [0, 0.02, 0] upper = [4.5*maximum,0.15, 95] elif country == 'Philippines': lower = [0, 0.02, 0] upper = [4.5*maximum,0.12, 95] elif country == 'Algeria': lower = [0, 0.02, 0] upper = [4.5*maximum,0.12, 95] elif country == 'Mexico': lower = [0, 0.02, 0] upper = [4.5*maximum,0.20, 95] elif country == 'Denmark': lower = [0, 0.02, 0] if maximum <=10:upper = [4.0*maximum, 0.30, 80] else: upper = [4.5*maximum,0.12, 94] elif country == 'Poland': lower = [0, 0.02, 0] upper = [4.0*maximum,0.20, 94] elif country == 'Korea, South': lower = [0, 0.02, 0] upper = [2.5*maximum,0.10, 52] elif country == 'Peru': lower = [0.0, 0.02, 0] upper = [4.5*maximum,0.18, 95] elif country == 'Australia': lower = [0, 0.02, 0] if maximum <=10: upper = [2.0*maximum, 0.20, 45] else: upper = [2.5*maximum,0.20, 70] elif country == 'Pakistan': lower = [0.0, 0.02, 0] upper = [4.5*maximum,0.12,95] elif country == 'Saudi Arabia': lower = [0.0, 0.02, 0] upper = [4.5*maximum,0.15,95] elif country == 'Afghanistan': lower = [0.0, 0.02, 0] upper = [4.5*maximum,0.12,95] elif country == 'Diamond Princess': lower = [0.0, 0.02, 0] upper = [1.0*maximum,0.50,2] elif country == 'Hungary': lower = [0.0, 0.02, 0] upper = [4.5*maximum,0.14,94] elif country == 'New Zealand': lower = [0.0, 0.02, 0] upper = [4.0*maximum,0.14,90] elif country == 'Somalia': lower = [0.0, 0.02, 0] upper = [4.5*maximum,0.10,94] elif country == x: lower = [0.0, 0.02, 0] upper = [3.5*maximum,0.15,85] else: lower = [0.0, 0.02, 0] if isState: if maximum <=10:upper = [4.0*maximum,0.30,80] else: upper = [4.5*maximum,0.15,80] else: if maximum <=10:upper = [4.0*maximum,0.60,85] else: upper = [4.5*maximum,0.18,95] return lower, upper def get_bounds_confirm (country, isState, y_train): x = '' for c in emerging_countries: if country == c: x = c; break maximum = max(y_train) if maximum == 
0.0: maximum = 1.0 if country == 'China': lower = [0, 0.02, 0] upper = [2.0*maximum,0.20,30] elif country == 'Iran': lower = [0, 0.00, 0] upper = [3.0*maximum,0.12,70] elif country == 'Italy': lower = [0, 0.00, 0] upper = [3.0*maximum,0.12, 70] elif country == 'US': lower = [0, 0.02, 0] if maximum <=10:upper = [4.0*maximum, 0.30, 80] else: upper = [3.0*maximum, 0.18, 85] elif country == 'France': lower = [0, 0.02, 0] if maximum <=10:upper = [4.0*maximum, 0.15, 80] else: upper = [4.5*maximum, 0.10, 90] elif country == 'Spain': lower = [0, 0.02, 0] upper = [3.0*maximum,0.13, 75] elif country == 'Germany': lower = [0, 0.02, 0] upper = [3.0*maximum,0.13, 75] elif country == 'Belgium': lower = [0, 0.02, 0] upper = [3.0*maximum,0.15, 78] elif country == 'Turkey': lower = [0, 0.02, 0] upper = [3.5*maximum,0.20, 90] elif country == 'Netherlands': lower = [0, 0.02, 0] upper = [4.0*maximum,0.10, 88] elif country == 'Switzerland': lower = [0, 0.02, 0] upper = [3.5*maximum,0.10, 75] elif country == 'United Kingdom': lower = [0, 0.02, 0] upper = [4.5*maximum,0.12, 95] elif country == 'Portugal': lower = [0, 0.02, 0] upper = [4.0*maximum,0.11, 88] elif country == 'Sweden': lower = [0, 0.02, 0] upper = [4.0*maximum,0.10, 88] elif country == 'Brazil': lower = [0, 0.02, 0] upper = [3.5*maximum,0.18, 88] elif country == 'Indonesia': lower = [0, 0.02, 0] upper = [5.5*maximum,0.09, 100] elif country == 'Austria': lower = [0, 0.02, 0] upper = [3.5*maximum,0.12, 75] elif country == 'Ireland': lower = [0, 0.02, 0] upper = [4.5*maximum,0.12, 95] elif country == 'Canada': lower = [0, 0.02, 0] if maximum <=10: upper = [3.0*maximum, 0.28, 75] else: upper = [4.5*maximum, 0.12, 93] elif country == 'India': lower = [0, 0.02, 0] upper = [4.5*maximum,0.16, 96] elif country == 'Ecuador': lower = [0, 0.02, 0] upper = [4.5*maximum,0.20, 95] elif country == 'Romania': lower = [0, 0.02, 0] upper = [4.5*maximum,0.11, 93] elif country == 'Philippines': lower = [0, 0.02, 0] upper = [5.5*maximum,0.12, 95] elif country == 'Algeria': lower = [0, 0.02, 0] upper = [5.5*maximum,0.10, 98] elif country == 'Mexico': lower = [100, 0.02, 0] upper = [4.5*maximum,0.15, 95] elif country == 'Denmark': lower = [0, 0.02, 0] if isState: if maximum <= 10: upper = [2.0*maximum,0.20,80] else: upper = [2.5*maximum,0.25, 55] else: if maximum <=10: upper = [2.0*maximum,0.30, 40] else: upper = [5.5*maximum,0.06, 100] elif country == 'Poland': lower = [0, 0.02, 0] upper = [4.5*maximum,0.11, 94] elif country == 'Korea, South': lower = [0, 0.02, 0] upper = [2.0*maximum,0.25, 18] elif country == 'Peru': lower = [0, 0.02, 0] upper = [4.5*maximum,0.20, 96] elif country == 'Australia': lower = [0, 0.02, 0] if maximum <=10: upper = [2.0*maximum, 0.25, 45] else: upper = [2.5*maximum,0.18, 65] elif country == 'Pakistan': lower = [0.0, 0.02, 0] upper = [4.5*maximum,0.10,94] elif country == 'Saudi Arabia': lower = [0.0, 0.02, 0] upper = [4.5*maximum,0.10,94] elif country == 'Afghanistan': lower = [0.0, 0.02, 0] upper = [4.5*maximum,0.12,94] elif country == 'Diamond Princess': lower = [0.0, 0.02, 0] upper = [1.0*maximum,1.0,1.0] elif country == 'Hungary': lower = [0.0, 0.02, 0] upper = [4.5*maximum,0.10,94] elif country == 'New Zealand': lower = [0.0, 0.02, 0] upper = [4.5*maximum,0.15,85] elif country == 'Somalia': lower = [0.0, 0.02, 0] upper = [1.0*maximum,0.08,50] elif country == x: lower = [0.0, 0.02, 0] upper = [3.5*maximum,0.10,80] else: lower = [0.0, 0.02, 0] if isState: if maximum <= 200: upper = [2.0*maximum,0.20,80] else: upper = [4.5*maximum,0.20,80] 
else: if maximum <= 200: upper = [3.0*maximum,0.20,85] else: upper = [4.5*maximum,0.20,96] return lower, upper plt.rcParams.update({'font.size': 12}) fig,(ax0,ax1) = plt.subplots(1,2,figsize=(20, 8)) fig,(ax2,ax3) = plt.subplots(1,2,figsize=(20, 8)) rainbow_col= plt.cm.jet(np.linspace(0,1,len(countries))) for country, c in tqdm(zip(countries,rainbow_col)): #print('\ \ \ \ country ==>', country) df_country_train = df_max_fatality_country[df_max_fatality_country['Country_Region']==country] df_country_test = test_data[test_data['Country_Region']==country] if country != 'China': df_country_train = df_country_train.reset_index().loc[df_country_train.reset_index().Date>'2020-02-22'] #17 n_days_sans_China =train_data.Date.nunique() - df_country_train.Date.nunique() else: df_country_train = df_country_train.reset_index() n_days_sans_China = 0 n_train_days =df_country_train.Date.nunique() x_train = range(n_train_days) x_test = range(n_train_days+n_test_days-overlap_days)#n_test_days+overlap_days) y_train_f = df_country_train['Fatalities'] y_train_c = df_country_train['ConfirmedCases'] y_train_cn = (df_country_train['ConfirmedCases'] - df_country_train['ConfirmedCases'].shift(1)).fillna(0.0).replace([-np.inf, np.inf], 0.0) y_train_fn = (df_country_train['Fatalities'] - df_country_train['Fatalities'].shift(1)).fillna(0.0).replace([-np.inf, np.inf], 0.0) ###### Fatalities: lower, upper = get_bounds_fatal (country, 0, y_train_f) popt_f, pcov_f = curve_fit(Gompertz, x_train, y_train_f, method='trf', bounds=(lower,upper)) a_max, estimated_c, estimated_t0 = popt_f y_predict_f = Gompertz(a_max, estimated_c, x_test, estimated_t0) y_predict_f_at_t0 = Gompertz(a_max, estimated_c, estimated_t0, estimated_t0) #print('\ fatalities ==>, max: ',a_max, ', slope: %.2f'% estimated_c, ', inflection point: ', # estimated_t0, ', r2 score: %.2f'% r2_score(y_train_f[:], y_predict_f[0:n_train_days])) y_fn = np.array([]) fn = [y_predict_f[i]-y_predict_f[i-1] if i!=0 else y_predict_f[i] for i in range(len(y_predict_f))] y_predict_fn = np.append(y_fn, fn) ###### Confirmed cases: lower_c,upper_c = get_bounds_confirm (country, 0, y_train_c) popt_c, pcov_c = curve_fit(Gompertz, x_train, y_train_c, method='trf', bounds=(lower_c,upper_c)) a_max_c, estimated_c_c, estimated_t0_c = popt_c y_predict_c = Gompertz(a_max_c, estimated_c_c, x_test, estimated_t0_c) y_predict_c_at_t0 = Gompertz(a_max_c, estimated_c_c, estimated_t0_c, estimated_t0_c) #print('confirmed ==> max: ',a_max_c, ', slope: %.2f'% estimated_c_c, ', inflection point: ', # estimated_t0_c, ', r2 score: %.2f'% r2_score(y_train_c[:], y_predict_c[0:n_train_days])) y_cn = np.array([]) cn = [y_predict_c[i]-y_predict_c[i-1] if i!=0 else y_predict_c[i] for i in range(len(y_predict_c))] y_predict_cn = np.append(y_cn, cn) ## ===== Move the x-axis of trained and test datasets to allign with dates in China ====== extend_days_test = [i+len(x_test) for i in range(n_days_sans_China)] x_test = np.append(x_test, extend_days_test) y_predict_c = np.pad(y_predict_c, (n_days_sans_China, 0), 'constant') y_predict_cn = np.pad(y_predict_cn,(n_days_sans_China, 0), 'constant') y_predict_f = np.pad(y_predict_f, (n_days_sans_China, 0), 'constant') y_predict_fn = np.pad(y_predict_fn, (n_days_sans_China, 0), 'constant') inflection_c = estimated_t0_c+n_days_sans_China extend_days_train = [i+len(x_train) for i in range(n_days_sans_China)] x_train = np.append(x_train, extend_days_train) y_train_c = np.pad(y_train_c, (n_days_sans_China, 0), 'constant') y_train_cn = np.pad(y_train_cn, (n_days_sans_China, 
0), 'constant') y_train_f = np.pad(y_train_f, (n_days_sans_China, 0), 'constant') y_train_fn = np.pad(y_train_fn, (n_days_sans_China, 0), 'constant') inflection_f = estimated_t0+n_days_sans_China ## ===== Plot ======= ax0.plot(x_test, y_predict_c, linewidth=2, label=country, color=c) ax0.plot(inflection_c, y_predict_c_at_t0, marker='o', markersize=6, color='green')#, label='inflection') ax0.plot(x_train, y_train_c, linewidth=2, color=c,linestyle='dotted')#, label='train_'+country) ax0.set_title(""Total Confirmed Cases"") ax0.set_xlabel(""Number of days"") ax0.set_ylabel(""Confirmed Cases"") ax0.legend()#loc='center left',bbox_to_anchor=(1.0, 0.5)) ax1.plot(x_test, y_predict_f, linewidth=2, label=country,color=c) ax1.plot(inflection_f, y_predict_f_at_t0, marker='o', markersize=6, color='green') ax1.plot(x_train, y_train_f, linewidth=2,color=c, linestyle='dotted')#, label='train_'+country) ax1.set_title(""Total Fatalities"") ax1.set_xlabel(""Number of days"") ax1.set_ylabel(""Fatalities"") ax1.legend()#loc='center left',bbox_to_anchor=(1.0, 0.5)) ax2.plot(x_test, y_predict_cn, linewidth=2, label=country, color=c) ax2.scatter(x_train, y_train_cn, linewidth=2, color=c, linestyle='dotted')#, label='train_'+country) ax2.set_title(""New Confirmed Cases"") ax2.set_xlabel(""Number of days"") ax2.set_ylabel(""New Confirmed Cases"") ax2.legend()#loc='center left',bbox_to_anchor=(1.0, 0.5)) ax3.plot(x_test, y_predict_fn, linewidth=2, label=country, color=c) ax3.scatter(x_train, y_train_fn, linewidth=2, color=c, linestyle='dotted')#, label='train_'+country) ax3.set_title(""New Fatalities"") ax3.set_xlabel(""Number of days"") ax3.set_ylabel(""New Fatalities"") ax3.legend()#loc='center left',bbox_to_anchor=(1.0, 0.5))'",Yes,3,33.0 "nCountries= train_data['Country_Region'].unique() isState = bool x_train = range(n_train_days) x_test = range(n_train_days+n_test_days-overlap_days) for country in tqdm(nCountries): fig,(ax0,ax1) = plt.subplots(1,2,figsize=(20,8)) fig,(ax2,ax3) = plt.subplots(1,2,figsize=(20,8)) #print('\ \ \ \ country ==>', country) df_country_train = train_data[train_data['Country_Region']==country] df_country_test = test_data[test_data['Country_Region']==country] if country != 'China': df_country_train = df_country_train.reset_index().loc[df_country_train.reset_index().Date>'2020-02-22'] #17 n_days_sans_China =train_data.Date.nunique() - df_country_train.Date.nunique() else: df_country_train = df_country_train.reset_index() n_days_sans_China = 0 n_train_days =df_country_train.Date.nunique() x_train = range(n_train_days) x_test = range(n_train_days+n_test_days-overlap_days) nvalues = df_country_train['Province_State'].isna().nunique() #fix for problem with Denmark data if (df_country_train['Province_State'].isna().unique()==True).any() and nvalues<2: isState = False y_train_f = df_country_train['Fatalities'] y_train_c = df_country_train['ConfirmedCases'] y_train_cn = (df_country_train['ConfirmedCases'] - df_country_train['ConfirmedCases'].shift(1)).fillna(0.0) y_train_fn = (df_country_train['Fatalities'] - df_country_train['Fatalities'].shift(1)).fillna(0.0) if y_train_f.empty == False: lower, upper = get_bounds_fatal (country, isState, y_train_f) #print(lower, upper) popt_f, pcov_f = curve_fit(Gompertz, x_train, y_train_f, method='trf', bounds=(lower,upper)) a_max, estimated_c, estimated_t0 = popt_f y_predict_f = Gompertz(a_max, estimated_c, x_test, estimated_t0) #print('\ fatalities ==>, max: ',a_max, ', slope: %.2f'% estimated_c, ', inflection point: ', # estimated_t0, ', r2 score: %.2f'% 
r2_score(y_train_f[:], y_predict_f[0:n_train_days])) y_fn = np.array([]) fn = [y_predict_f[i]-y_predict_f[i-1] if i!=0 else y_predict_f[i] for i in range(len(y_predict_f))] y_predict_fn = np.append(y_fn, fn) if y_train_c.empty == False: lower_c, upper_c = get_bounds_confirm (country, isState, y_train_c) #print(lower_c, upper_c) popt_c, pcov_c = curve_fit(Gompertz, x_train, y_train_c, method='trf', bounds=(lower_c,upper_c)) a_max_c, estimated_c_c, estimated_t0_c = popt_c y_predict_c = Gompertz(a_max_c, estimated_c_c, x_test, estimated_t0_c) #print('\ confirmed ==> max: ',a_max_c, ', slope: %.2f'% estimated_c_c, ', inflection point: ', # estimated_t0_c, ', r2 score: %.2f'% r2_score(y_train_c[:], y_predict_c[0:n_train_days])) y_cn = np.array([]) cn = [y_predict_c[i]-y_predict_c[i-1] if i!=0 else y_predict_c[i] for i in range(len(y_predict_c))] y_predict_cn = np.append(y_cn, cn) ## ===== Move the x-axis of trained and test datasets to allign with dates in China ====== extend_days_test = [i+len(x_test) for i in range(n_days_sans_China)] x_test = np.append(x_test, extend_days_test) y_predict_c = np.pad(y_predict_c, (n_days_sans_China, 0), 'constant') y_predict_cn = np.pad(y_predict_cn,(n_days_sans_China, 0), 'constant') y_predict_f = np.pad(y_predict_f, (n_days_sans_China, 0), 'constant') inflection_f = estimated_t0+n_days_sans_China y_predict_fn = np.pad(y_predict_fn, (n_days_sans_China, 0), 'constant') extend_days_train = [i+len(x_train) for i in range(n_days_sans_China)] x_train = np.append(x_train, extend_days_train) y_train_c = np.pad(y_train_c, (n_days_sans_China, 0), 'constant') y_train_cn = np.pad(y_train_cn, (n_days_sans_China, 0), 'constant') y_train_f = np.pad(y_train_f, (n_days_sans_China, 0), 'constant') y_train_fn = np.pad(y_train_fn, (n_days_sans_China, 0), 'constant') inflection_c = estimated_t0_c+n_days_sans_China ax0.plot(x_test, y_predict_c, linewidth=2, label='predict_'+country) ax0.plot(x_train, y_train_c, linewidth=2, color='r', linestyle='dotted', label='train_'+country) ax0.set_title(""Prediction vs Training for Confirmed Cases"") ax0.set_xlabel(""Number of days"") ax0.set_ylabel(""Confirmed Cases"") ax0.legend() test_data.loc[test_data['Country_Region']==country,'ConfirmedCases'] = y_predict_c[-n_test_days:] ax1.plot(x_test, y_predict_f, linewidth=2, label='predict_'+country) ax1.plot(x_train, y_train_f, linewidth=2, color='r', linestyle='dotted', label='train_'+country) ax1.set_title(""Prediction vs Training for Fatalities"") ax1.set_xlabel(""Number of days"") ax1.set_ylabel(""Fatalities"") ax1.legend() test_data.loc[test_data['Country_Region']==country,'Fatalities'] = y_predict_f[-n_test_days:] ax2.plot(x_test, y_predict_cn, linewidth=2, label='predict_'+country) ax2.scatter(x_train, y_train_cn, linewidth=2, color='r', linestyle='dotted', label='train_'+country) ax2.set_title(""New Confirmed Cases"") ax2.set_xlabel(""Number of days"") ax2.set_ylabel(""New Confirmed Cases"") ax2.legend()#loc='center left',bbox_to_anchor=(1.0, 0.5)) ax3.plot(x_test, y_predict_fn, linewidth=2, label='predict_'+country) ax3.scatter(x_train, y_train_fn, linewidth=2, color='r', linestyle='dotted', label='train_'+country) ax3.set_title(""New Fatalities"") ax3.set_xlabel(""Number of days"") ax3.set_ylabel(""New Fatalities"") ax3.legend()#loc='center left',bbox_to_anchor=(1.0, 0.5)) else: # use Province/State data when available isState = True state_list = [] y_predict_c_dict = {}; y_train_c_dict = {} y_predict_cn_dict = {}; y_train_cn_dict = {} y_predict_f_dict = {}; y_train_f_dict = {} 
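# (added note) for countries reported at Province/State level, the loop below fits a separate
# Gompertz curve per state and writes the per-state predictions back into test_data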
y_predict_fn_dict = {}; y_train_fn_dict = {} for state in df_country_train['Province_State'].unique(): df_state_train = df_country_train[df_country_train['Province_State']==state] #state df_state_test = df_country_test[df_country_test['Province_State']==state] state_list.append(state) y_train_f = df_state_train['Fatalities'] y_train_c = df_state_train['ConfirmedCases'] y_train_cn = (df_state_train['ConfirmedCases'] - df_state_train['ConfirmedCases'].shift(1)).fillna(0.0) y_train_fn = (df_state_train['Fatalities'] - df_state_train['Fatalities'].shift(1)).fillna(0.0) if y_train_f.empty== False: lower, upper = get_bounds_fatal (country, isState, y_train_f) popt_f, pcov_f = curve_fit(Gompertz, x_train, y_train_f, method='trf', bounds=(lower,upper)) a_max, estimated_c, estimated_t0 = popt_f y_predict_f = Gompertz(a_max, estimated_c, x_test, estimated_t0) y_predict_f_dict[state] = y_predict_f y_train_f_dict[state] = y_train_f #print('\ fatalities state ==>, max: ',a_max, ', slope: %.2f'% estimated_c, ', inflection point: ', # estimated_t0, ', r2 score: %.2f'% r2_score(y_train_f[:], y_predict_f[0:70])) y_fn = np.array([]) fn = [y_predict_f[i]-y_predict_f[i-1] if i!=0 else y_predict_f[i] for i in range(len(y_predict_f))] y_predict_fn = np.append(y_fn, fn) y_predict_fn_dict[state] = y_predict_fn y_train_fn_dict[state] = y_train_fn if y_train_c.empty == False: lower_c, upper_c = get_bounds_confirm (country, isState, y_train_c) popt_c, pcov_c = curve_fit(Gompertz, x_train, y_train_c, method='trf', bounds=(lower_c,upper_c)) a_max_c, estimated_c_c, estimated_t0_c = popt_c y_predict_c = Gompertz(a_max_c, estimated_c_c, x_test, estimated_t0_c) y_predict_c_dict[state] = y_predict_c y_train_c_dict[state] = y_train_c #print('\ confirmed state ==> max: ',a_max_c, ', slope: %.2f'% estimated_c_c, ', inflection point: ', # estimated_t0_c, ', r2 score: %.2f'% r2_score(y_train_c[:], y_predict_c[0:70])) y_cn = np.array([]) cn = [y_predict_c[i]-y_predict_c[i-1] if i!=0 else y_predict_c[i] for i in range(len(y_predict_c))] y_predict_cn = np.append(y_cn, cn) y_predict_cn_dict[state] = y_predict_cn y_train_cn_dict[state] = y_train_cn ## ====== Plot and Store the Results: ====== ## ====== Move the x-axis of trained and test datasets to allign with dates in China ====== extend_days_test = [i+len(x_test) for i in range(n_days_sans_China)] x_test = np.append(x_test, extend_days_test) extend_days_train = [i+len(x_train) for i in range(n_days_sans_China)] x_train = np.append(x_train, extend_days_train) for state, y_predict in y_predict_f_dict.items(): y_predict = np.pad(y_predict, (n_days_sans_China, 0), 'constant') ax1.plot(x_test, y_predict, linewidth=2, label=country+'_'+state) ax1.legend(loc='center left',bbox_to_anchor=(1.0, 0.5)) test_data.loc[(test_data['Country_Region']==country)&(test_data['Province_State']==state),'Fatalities'] = y_predict[-n_test_days:] for state, y_train in y_train_f_dict.items(): y_train = np.pad(y_train, (n_days_sans_China, 0), 'constant') ax1.plot(x_train, y_train, linewidth=2, color='r', linestyle='dotted', label='train_'+state) ax1.set_title(""Prediction vs Training for Fatalities"") ax1.set_xlabel(""Number of days"") ax1.set_ylabel(""Fatalities"") for state, y_predict in y_predict_c_dict.items(): y_predict = np.pad(y_predict, (n_days_sans_China, 0), 'constant') ax0.plot(x_test, y_predict, linewidth=2, label=country+'_'+state) #ax0.legend(loc='center left',bbox_to_anchor=(1.0, 0.5)) test_data.loc[(test_data['Country_Region']==country)&(test_data['Province_State']==state),'ConfirmedCases'] = 
y_predict[-n_test_days:] for state, y_train in y_train_c_dict.items(): y_train = np.pad(y_train, (n_days_sans_China, 0), 'constant') ax0.plot(x_train, y_train, linewidth=2, color='r', linestyle='dotted', label='train_'+country+'_'+state) ax0.set_title(""Prediction vs Training for ConfirmedCases"") ax0.set_xlabel(""Number of days"") ax0.set_ylabel(""Confirmed Cases"") for state, y_predict in y_predict_fn_dict.items(): y_predict = np.pad(y_predict, (n_days_sans_China, 0), 'constant') ax3.plot(x_test, y_predict, linewidth=2, label=country+'_'+state) ax3.legend(loc='center left',bbox_to_anchor=(1.0, 0.5)) for state, y_train in y_train_fn_dict.items(): y_train = np.pad(y_train, (n_days_sans_China, 0), 'constant') ax3.scatter(x_train, y_train, linewidth=2, color='r', linestyle='dotted', label='train_'+state) ax3.set_title(""New Fatalities"") ax3.set_xlabel(""Number of days"") ax3.set_ylabel(""New Fatalities"") for state, y_predict in y_predict_cn_dict.items(): y_predict = np.pad(y_predict, (n_days_sans_China, 0), 'constant') ax2.plot(x_test, y_predict, linewidth=2, label=country+'_'+state) #ax2.legend(loc='center left',bbox_to_anchor=(1.0, 0.5)) test_data.loc[(test_data['Country_Region']==country)&(test_data['Province_State']==state),'ConfirmedCases'] = y_predict[-n_test_days:] for state, y_train in y_train_cn_dict.items(): y_train = np.pad(y_train, (n_days_sans_China, 0), 'constant') ax2.scatter(x_train, y_train, linewidth=2, color='r', linestyle='dotted', label='train_'+country+'_'+state) ax2.set_title(""New Confirmed Cases"") ax2.set_xlabel(""Number of days"") ax2.set_ylabel(""New Confirmed Cases"")'",Yes,3,56.0 "submit_data = pd.read_csv(""/kaggle/input/covid19-global-forecasting-week-4/submission.csv"")#, index_col=0) test_data['Fatalities'] = test_data['Fatalities'].fillna(0.0).astype(int) test_data['ConfirmedCases'] = test_data['ConfirmedCases'].fillna(0.0).astype(int) submit_data['Fatalities'] = test_data['Fatalities'].astype('int') submit_data['ConfirmedCases'] = test_data['ConfirmedCases'].astype('int') submit_data.to_csv('submission.csv', index=False) submit_data.head()'",Yes,4,25.0 display(submit_data.describe()),No,5,40.0 "import numpy as np import pandas as pd from sklearn.model_selection import train_test_split df = pd.read_csv('../input/44352/training_solutions_rev1.csv') df_train, df_test = train_test_split(df, test_size=.2) df_train.shape, df_test.shape",No,3,45.0 "from skimage.transform import resize from tqdm import tqdm import matplotlib.pyplot as plt %matplotlib inline ORIG_SHAPE = (424,424) CROP_SIZE = (256,256) IMG_SHAPE = (64,64) def get_image(path, x1,y1, shape, crop_size): x = plt.imread(path) x = x[x1:x1+crop_size[0], y1:y1+crop_size[1]] x = resize(x, shape) x = x/255. 
return x def get_all_images(dataframe, shape=IMG_SHAPE, crop_size=CROP_SIZE): x1 = (ORIG_SHAPE[0]-CROP_SIZE[0])//2 y1 = (ORIG_SHAPE[1]-CROP_SIZE[1])//2 sel = dataframe.values ids = sel[:,0].astype(int).astype(str) y_batch = sel[:,1:] x_batch = [] for i in tqdm(ids): x = get_image('../input/44352/images_training_rev1/'+i+'.jpg', x1,y1, shape=shape, crop_size=crop_size) x_batch.append(x) x_batch = np.array(x_batch) return x_batch, y_batch X_train, y_train = get_all_images(df_train) X_test, y_test = get_all_images(df_test)",Yes,3,44.0 "from keras.models import Sequential from keras.layers import Conv2D, MaxPooling2D from keras.layers import Activation, Dropout, Flatten, Dense, BatchNormalization, GlobalMaxPooling2D from keras import backend as K def root_mean_squared_error(y_true, y_pred): return K.sqrt(K.mean(K.square(y_pred - y_true))) model = Sequential() model.add(Conv2D(512, (3, 3), input_shape=(IMG_SHAPE[0], IMG_SHAPE[1], 3))) model.add(Conv2D(256, (3, 3))) #model.add(BatchNormalization()) model.add(Activation('relu')) model.add(MaxPooling2D(pool_size=(2, 2))) model.add(Conv2D(256, (3, 3))) model.add(Conv2D(128, (3, 3))) #model.add(BatchNormalization()) model.add(Activation('relu')) model.add(MaxPooling2D(pool_size=(2, 2))) model.add(Conv2D(128, (3, 3))) model.add(Conv2D(128, (3, 3))) #model.add(BatchNormalization()) model.add(Activation('relu')) model.add(GlobalMaxPooling2D()) model.add(Dropout(0.25)) model.add(Dense(128)) model.add(Activation('relu')) model.add(Dropout(0.25)) model.add(Dense(128)) model.add(Activation('relu')) model.add(Dropout(0.25)) model.add(Dense(128)) model.add(Activation('relu')) model.add(Dropout(0.25)) model.add(Dense(37)) model.add(Activation('sigmoid')) model.compile(loss='binary_crossentropy', optimizer='adamax', metrics=[root_mean_squared_error]) model.summary()",Yes,5,4.0 "batch_size = 128 model.fit(X_train, y_train, epochs=30, validation_data=(X_test, y_test))",No,5,7.0 "import os from tqdm import tqdm def test_image_generator(ids, shape=IMG_SHAPE): x1 = (ORIG_SHAPE[0]-CROP_SIZE[0])//2 y1 = (ORIG_SHAPE[1]-CROP_SIZE[1])//2 x_batch = [] for i in ids: x = get_image('../input/44352/images_test_rev1/'+i, x1, y1, shape=IMG_SHAPE, crop_size=CROP_SIZE) x_batch.append(x) x_batch = np.array(x_batch) return x_batch val_files = os.listdir('../input/44352/images_test_rev1/') val_predictions = [] N_val = len(val_files) for i in tqdm(np.arange(0, N_val, batch_size)): if i+batch_size > N_val: upper = N_val else: upper = i+batch_size X = test_image_generator(val_files[i:upper]) y_pred = model.predict(X) val_predictions.append(y_pred) val_predictions = np.array(val_predictions) Y_pred = np.vstack(val_predictions) ids = np.array([v.split('.')[0] for v in val_files]).reshape(len(val_files),1) submission_df = pd.DataFrame(np.hstack((ids, Y_pred)), columns=df.columns) submission_df = submission_df.sort_values(by=['GalaxyID']) submission_df.to_csv('sample_submission.csv', index=False)",Yes,2,48.0 "import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv) import os from catboost.datasets import amazon train, test = amazon() print(train.shape, test.shape) target = ""ACTION"" col4train = [x for x in train.columns if x not in [target, ""ROLE_TITLE""]] y = train[target].values",No,3,58.0 "from sklearn.ensemble import ExtraTreesClassifier from sklearn.model_selection import StratifiedKFold from sklearn.model_selection import cross_validate # returns model instance def get_model(): params = { ""n_estimators"":300, ""n_jobs"": 3, ""random_state"":5436, } return ExtraTreesClassifier(**params) # validate model on given dataset and report CV score def validate_model(model, data): skf = StratifiedKFold(n_splits=5, random_state = 4141, shuffle = True) stats = cross_validate( model, data[0], data[1], groups=None, scoring='roc_auc', cv=skf, n_jobs=None, return_train_score = True ) stats = pd.DataFrame(stats) return stats.describe().transpose() # transforms given train and test datasets using provided function, # function parameters can be passed as a dict def transform_dataset(train, test, func, func_params = {}): dataset = pd.concat([train, test], ignore_index = True) dataset = func(dataset, **func_params) if isinstance(dataset, pd.DataFrame): new_train = dataset.iloc[:train.shape[0],:].reset_index(drop = True) new_test = dataset.iloc[train.shape[0]:,:].reset_index(drop = True) else: new_train = dataset[:train.shape[0]] new_test = dataset[train.shape[0]:] return new_train, new_test'",Yes,3,28.0 "MJTCP = 32292 #Michael Jordan total career points #for each column in dataset creates N column with random integers def assign_rnd_integer(dataset, number_of_times = 5, seed = MJTCP): new_dataset = pd.DataFrame() np.random.seed(seed) for c in dataset.columns: for i in range(number_of_times): col_name = c+""_""+str(i) unique_vals = dataset[c].unique() labels = np.array(list(range(len(unique_vals)))) np.random.shuffle(labels) mapping = pd.DataFrame({c: unique_vals, col_name: labels}) new_dataset[col_name] = (dataset[[c]] .merge(mapping, on = c, how = 'left')[col_name] ).values return new_dataset'",Yes,3,12.0 "new_train, new_test = transform_dataset( train[col4train], test[col4train], assign_rnd_integer, {""number_of_times"":5} ) print(new_train.shape, new_test.shape) new_train.head(5)",Yes,3,41.0 "validate_model( model = get_model(), data = [new_train.values, y] )",No,3,28.0 "new_train, new_test = transform_dataset( train[col4train], test[col4train], assign_rnd_integer, {""number_of_times"":1} ) print(new_train.shape, new_test.shape) validate_model( model = get_model(), data = [new_train.values, y] )",Yes,3,28.0 "new_train, new_test = transform_dataset( train[col4train], test[col4train], assign_rnd_integer, {""number_of_times"":10} ) print(new_train.shape, new_test.shape) validate_model( model = get_model(), data = [new_train.values, y] )",Yes,4,28.0 "from sklearn.preprocessing import OneHotEncoder # transforms given dataset to OHE representation def one_hot(dataset): ohe = OneHotEncoder(sparse=True, dtype=np.float32, handle_unknown='ignore') return ohe.fit_transform(dataset.values)",Yes,5,20.0 "new_train, new_test = transform_dataset( train[col4train], test[col4train], one_hot) print(new_train.shape, new_test.shape)",No,3,12.0 "from sklearn.decomposition import TruncatedSVD from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer def extract_col_interaction(dataset, col1, col2, tfidf = True): data = dataset.groupby([col1])[col2].agg(lambda x: "" "".join(list([str(y) for y in x]))) if tfidf: vectorizer = 
TfidfVectorizer(tokenizer=lambda x: x.split("" "")) else: vectorizer = CountVectorizer(tokenizer=lambda x: x.split("" "")) data_X = vectorizer.fit_transform(data) dim_red = TruncatedSVD(n_components=1, random_state = 5115) data_X = dim_red.fit_transform(data_X) result = pd.DataFrame() result[col1] = data.index.values result[col1+""_{}_svd"".format(col2)] = data_X.ravel() return result import itertools def get_col_interactions_svd(dataset, tfidf = True): new_dataset = pd.DataFrame() for col1,col2 in itertools.permutations(dataset.columns, 2): data = extract_col_interaction(dataset, col1,col2, tfidf) col_name = [x for x in data.columns if ""svd"" in x][0] new_dataset[col_name] = dataset[[col1]].merge(data, on = col1, how = 'left')[col_name] return new_dataset'",Yes,3,8.0 "new_train, new_test = transform_dataset( train[col4train], test[col4train], get_col_interactions_svd ) print(new_train.shape, new_test.shape) new_train.head(5)",Yes,3,41.0 "def get_freq_encoding(dataset): new_dataset = pd.DataFrame() for c in dataset.columns: data = dataset.groupby([c]).size().reset_index() new_dataset[c+""_freq""] = dataset[[c]].merge(data, on = c, how = ""left"")[0] return new_dataset",No,3,12.0 "new_train, new_test = transform_dataset( train[col4train], test[col4train], get_freq_encoding ) print(new_train.shape, new_test.shape) new_train.head(5)",Yes,3,41.0 "new_train1, new_test1 = transform_dataset( train[col4train], test[col4train], get_freq_encoding ) new_train2, new_test2 = transform_dataset( train[col4train], test[col4train], get_col_interactions_svd ) new_train3, new_test3 = transform_dataset( train[col4train], test[col4train], assign_rnd_integer, {""number_of_times"":10} ) new_train = pd.concat([new_train1, new_train2, new_train3], axis = 1) new_test = pd.concat([new_test1, new_test2, new_test3], axis = 1) print(new_train.shape, new_test.shape)",Yes,4,11.0 "model = get_model() model.fit(new_train.values, y) predictions = model.predict_proba(new_test)[:,1] submit = pd.DataFrame() submit[""Id""] = test[""id""] submit[""ACTION""] = predictions submit.to_csv(""submission.csv"", index = False)",Yes,4,25.0 "# Assuring you have the most recent CatBoost release !pip install catboost -U",No,5,87.0 "# Getting useful tabular processing and generator functions !git clone https://github.com/lmassaron/deep_learning_for_tabular_data.git",No,2,23.0 "# Importing core libraries import numpy as np import pandas as pd from time import time import pprint import joblib # Suppressing warnings because of skopt verbosity import warnings warnings.filterwarnings(""ignore"") # Classifiers from catboost import CatBoostClassifier, Pool # Model selection from sklearn.model_selection import StratifiedKFold # Metrics from sklearn.metrics import roc_auc_score, average_precision_score from sklearn.metrics import make_scorer",No,5,23.0 "# Loading data directly from CatBoost from catboost.datasets import amazon X, Xt = amazon() y = X[""ACTION""].apply(lambda x: 1 if x == 1 else 0).values X.drop([""ACTION""], axis=1, inplace=True)",Yes,4,21.0 "# Transforming all the labels of all variables from sklearn.preprocessing import LabelEncoder label_encoders = [LabelEncoder() for _ in range(X.shape[1])] for col, column in enumerate(X.columns): label_encoders[col].fit(X[column].append(Xt[column])) X[column] = label_encoders[col].transform(X[column]) Xt[column] = label_encoders[col].transform(Xt[column])",Yes,5,20.0 "# Enconding frequencies instead of labels (so we have some numeric variables) def frequency_encoding(column, df, df_test=None): 
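# (added note) frequency encoding: each categorical value is replaced by how often it appears
# in the training frame; unseen test values fall back to a count of 1 via the fillna(1) below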
frequencies = df[column].value_counts().reset_index() df_values = df[[column]].merge(frequencies, how='left', left_on=column, right_on='index').iloc[:,-1].values if df_test is not None: df_test_values = df_test[[column]].merge(frequencies, how='left', left_on=column, right_on='index').fillna(1).iloc[:,-1].values else: df_test_values = None return df_values, df_test_values for column in X.columns: train_values, test_values = frequency_encoding(column, X, Xt) X[column+'_counts'] = train_values Xt[column+'_counts'] = test_values",No,5,20.0 "# Pointing out which variables are categorical and which are numeric categorical_variables = [col for col in X.columns if '_counts' not in col] numeric_variables = [col for col in X.columns if '_counts' in col]",No,3,77.0 X.head(),No,5,41.0 Xt.head(),No,5,41.0 "# Counting unique values of categorical variables X[categorical_variables].nunique()",No,5,54.0 "# Describing numeric variables X[numeric_variables].describe()",No,5,40.0 "# Initializing a CatBoostClassifier with best parameters best_params = {'bagging_temperature': 0.6, 'border_count': 200, 'depth': 8, 'iterations': 350, 'l2_leaf_reg': 30, 'learning_rate': 0.30, 'random_strength': 0.01, 'scale_pos_weight': 0.48} catb = CatBoostClassifier(**best_params, loss_function='Logloss', eval_metric = 'AUC', nan_mode='Min', thread_count=2, verbose = False)",No,4,4.0 "# CV interations roc_auc = list() average_precision = list() oof = np.zeros(len(X)) best_iteration = list() for train_idx, test_idx in skf.split(X, y): X_train, y_train = X.iloc[train_idx, :], y[train_idx] X_test, y_test = X.iloc[test_idx, :], y[test_idx] train = Pool(data=X_train, label=y_train, feature_names=list(X_train.columns), cat_features=categorical_variables) test = Pool(data=X_test, label=y_test, feature_names=list(X_test.columns), cat_features=categorical_variables) catb.fit(train, verbose_eval=100, early_stopping_rounds=50, eval_set=test, use_best_model=True, #task_type = ""GPU"", plot=False) best_iteration.append(catb.best_iteration_) preds = catb.predict_proba(X_test) oof[test_idx] = preds[:,1] roc_auc.append(roc_auc_score(y_true=y_test, y_score=preds[:,1])) average_precision.append(average_precision_score(y_true=y_test, y_score=preds[:,1]))",Yes,2,7.0 "# Using catboost on all the data for predictions best_params = {'bagging_temperature': 0.6, 'border_count': 200, 'depth': 8, 'iterations': int(np.median(best_iteration) * 1.3), 'l2_leaf_reg': 30, 'learning_rate': 0.30, 'random_strength': 0.01, 'scale_pos_weight': 0.48} catb = CatBoostClassifier(**best_params, loss_function='Logloss', eval_metric = 'AUC', nan_mode='Min', thread_count=2, verbose = False) train = Pool(data=X, label=y, feature_names=list(X_train.columns), cat_features=categorical_variables) catb.fit(train, verbose_eval=100, #task_type = ""GPU"", plot=False) submission = pd.DataFrame(Xt.id) Xt_pool = Pool(data=Xt[list(X_train.columns)], feature_names=list(X_train.columns), cat_features=categorical_variables) submission['Action'] = catb.predict_proba(Xt_pool)[:,1] submission.to_csv(""catboost_submission.csv"", index=False) cat_boost_submission = submission.copy()'",Yes,3,7.0 "import tensorflow as tf from tensorflow.keras import backend as K from tensorflow.keras.models import Sequential from tensorflow.keras.layers import Dense from tensorflow.keras.optimizers import Adam, Nadam from tensorflow.keras.layers import Input, Embedding, Reshape, GlobalAveragePooling1D from tensorflow.keras.layers import Flatten, concatenate, Concatenate, Lambda, Dropout, SpatialDropout1D from 
tensorflow.keras.layers import Reshape, MaxPooling1D,BatchNormalization, AveragePooling1D, Conv1D from tensorflow.keras.layers import Activation, LeakyReLU from tensorflow.keras.optimizers import SGD, Adam, Nadam from tensorflow.keras.models import Model, load_model from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau from tensorflow.keras.regularizers import l2, l1_l2 from keras.losses import binary_crossentropy from sklearn.metrics import roc_auc_score from sklearn.metrics import average_precision_score import matplotlib.pyplot as plt",No,5,22.0 "# Registering custom activations suitable for tabular problems from tensorflow.keras.utils import get_custom_objects from tensorflow.keras.layers import Activation, LeakyReLU from deep_learning_for_tabular_data.tabular import gelu, Mish, mish # Add gelu so we can use it as a string get_custom_objects().update({'gelu': Activation(gelu)}) # Add mish so we can use it as a string get_custom_objects().update({'mish': Mish(mish)}) # Add leaky-relu so we can use it as a string get_custom_objects().update({'leaky-relu': Activation(LeakyReLU(alpha=0.2))})",Yes,5,22.0 "# Parametric architecture def tabular_dnn(numeric_variables, categorical_variables, categorical_counts, feature_selection_dropout=0.2, categorical_dropout=0.1, first_dense = 256, second_dense = 256, dense_dropout = 0.2, activation_type=gelu): numerical_inputs = Input(shape=(len(numeric_variables),)) numerical_normalization = BatchNormalization()(numerical_inputs) numerical_feature_selection = Dropout(feature_selection_dropout)(numerical_normalization) categorical_inputs = [] categorical_embeddings = [] for category in categorical_variables: categorical_inputs.append(Input(shape=[1], name=category)) category_counts = categorical_counts[category] categorical_embeddings.append( Embedding(category_counts+1, int(np.log1p(category_counts)+1), name = category + ""_embed"")(categorical_inputs[-1])) categorical_logits = Concatenate(name = ""categorical_conc"")([Flatten()(SpatialDropout1D(categorical_dropout)(cat_emb)) for cat_emb in categorical_embeddings]) x = concatenate([numerical_feature_selection, categorical_logits]) x = Dense(first_dense, activation=activation_type)(x) x = Dropout(dense_dropout)(x) x = Dense(second_dense, activation=activation_type)(x) x = Dropout(dense_dropout)(x) output = Dense(1, activation=""sigmoid"")(x) model = Model([numerical_inputs] + categorical_inputs, output) return model",No,4,4.0 "# Useful functions from tensorflow.keras.metrics import AUC def mAP(y_true, y_pred): return tf.py_func(average_precision_score, (y_true, y_pred), tf.double) def compile_model(model, loss, metrics, optimizer): model.compile(loss=loss, metrics=metrics, optimizer=optimizer) return model def plot_keras_history(history, measures): """""" history: Keras training history measures = list of names of measures """""" rows = len(measures) // 2 + len(measures) % 2 fig, panels = plt.subplots(rows, 2, figsize=(15, 5)) plt.subplots_adjust(top = 0.99, bottom=0.01, hspace=0.4, wspace=0.2) try: panels = [item for sublist in panels for item in sublist] except: pass for k, measure in enumerate(measures): panel = panels[k] panel.set_title(measure + ' history') panel.plot(history.epoch, history.history[measure], label=""Train ""+measure) panel.plot(history.epoch, history.history[""val_""+measure], label=""Validation ""+measure) panel.set(xlabel='epochs', ylabel=measure) panel.legend() plt.show(fig)'",Yes,5,35.0 "# Global training settings SEED = 42 FOLDS = 5 BATCH_SIZE = 
512",No,5,77.0 "from deep_learning_for_tabular_data.tabular import TabularTransformer, DataGenerator # Setting the CV strategy skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=SEED) # CV Iteration roc_auc = list() average_precision = list() oof = np.zeros(len(X)) best_iteration = list() for fold, (train_idx, test_idx) in enumerate(skf.split(X, y)): tb = TabularTransformer(numeric = numeric_variables, ordinal = [], lowcat = [], highcat = categorical_variables) tb.fit(X.iloc[train_idx]) sizes = tb.shape(X.iloc[train_idx]) categorical_levels = dict(zip(categorical_variables, sizes[1:])) print(f""Input array sizes: {sizes}"") print(f""Categorical levels: {categorical_levels}\ "") model = tabular_dnn(numeric_variables, categorical_variables, categorical_levels, feature_selection_dropout=0.1, categorical_dropout=0.1, first_dense = 256, second_dense = 256, dense_dropout = 0.1, activation_type=gelu) model = compile_model(model, binary_crossentropy, [AUC(name='auc'), mAP], Adam(learning_rate=0.0001)) train_batch = DataGenerator(X.iloc[train_idx], y[train_idx], tabular_transformer=tb, batch_size=BATCH_SIZE, shuffle=True) history = model.fit_generator(train_batch, validation_data=(tb.transform(X.iloc[test_idx]), y[test_idx]), epochs=30, callbacks=[model_checkpoint, early_stopping, reduce_learning], class_weight=[1.0, (np.sum(y==0) / np.sum(y==1))], verbose=1) print(""\ FOLD %i"" % fold) plot_keras_history(history, measures=['auc', 'loss']) best_iteration.append(np.argmax(history.history['val_auc']) + 1) preds = model.predict(tb.transform(X.iloc[test_idx]), verbose=1, batch_size=1024).flatten() oof[test_idx] = preds roc_auc.append(roc_auc_score(y_true=y[test_idx], y_score=preds)) average_precision.append(average_precision_score(y_true=y[test_idx], y_score=preds))'",Yes,2,48.0 "# We train on all the examples, using a rule of thumb for the number of iterations tb = TabularTransformer(numeric = numeric_variables, ordinal = [], lowcat = [], highcat = categorical_variables) tb.fit(X) sizes = tb.shape(X) categorical_levels = dict(zip(categorical_variables, sizes[1:])) print(f""Input array sizes: {sizes}"") print(f""Categorical levels: {categorical_levels}\ "") model = tabular_dnn(numeric_variables, categorical_variables, categorical_levels, feature_selection_dropout=0.1, categorical_dropout=0.1, first_dense = 256, second_dense = 256, dense_dropout = 0.1, activation_type=gelu) model = compile_model(model, binary_crossentropy, [AUC(name='auc'), mAP], Adam(learning_rate=0.0001)) train_batch = DataGenerator(X, y, tabular_transformer=tb, batch_size=BATCH_SIZE, shuffle=True) history = model.fit_generator(train_batch, epochs=int(np.median(best_iteration)), class_weight=[1.0, (np.sum(y==0) / np.sum(y==1))], verbose=1)'",Yes,2,8.0 "# Predicting and submission preds = model.predict(tb.transform(Xt[X.columns]), verbose=1, batch_size=1024).flatten() submission = pd.DataFrame(Xt.id) submission['Action'] = preds submission.to_csv(""tabular_dnn_submission.csv"", index=False) tabular_dnn_submission = submission.copy()'",Yes,5,25.0 "from scipy.stats import rankdata # We use normalized ranks because probabilities emissions from the two models may differ dnn_rank = rankdata(tabular_dnn_submission.Action, method='dense') / len(Xt) cat_rank = rankdata(cat_boost_submission.Action, method='dense') / len(Xt) submission = pd.DataFrame(Xt.id) submission['Action'] = 0.5 * dnn_rank + 0.5 * cat_rank submission.to_csv(""blended_submission.csv"", index=False)'",Yes,5,25.0 "import matplotlib.pyplot as plt import seaborn as 
sns import numpy as np import pandas as pd import warnings warnings.filterwarnings(""ignore"")",No,5,23.0 "data = pd.read_csv('../input/amazon-employee-access-challenge/train.csv') print(data.shape) data.head()",Yes,4,45.0 data_explore = data.copy(),No,4,12.0 data_explore.info(),No,5,40.0 "sns.countplot(x='ACTION', data=data_explore)",No,5,33.0 "data_explore_role_dept = data_explore[['ROLE_DEPTNAME', ""ACTION""]].groupby(by='ROLE_DEPTNAME').count() data_explore_role_dept.sort_values('ACTION', ascending=False).head(n=15).transpose()'",No,2,40.0 "data_explore_role_codes = data_explore[['ROLE_CODE', ""ACTION""]].groupby(by='ROLE_CODE').count() data_explore_role_codes.sort_values('ACTION', ascending=False).head(n=15).transpose()'",Yes,3,12.0 "data_explore_role_family = data_explore[['ROLE_FAMILY', ""ACTION""]].groupby(by='ROLE_FAMILY').count() data_explore_role_family.sort_values('ACTION', ascending=False).head(n=15).transpose()'",Yes,3,12.0 "plt.figure(figsize=(12, 7)) corr_matrix = data_explore.corr() sns.heatmap(corr_matrix, mask=np.zeros_like(corr_matrix, dtype=np.bool), square=True, annot=True, cbar=False) plt.tight_layout()",No,5,80.0 corr_matrix['ACTION'].sort_values(ascending=False),No,2,40.0 "from sklearn.impute import SimpleImputer from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder from sklearn.pipeline import Pipeline from sklearn.compose import ColumnTransformer",No,5,22.0 "X = data.drop(columns=['ACTION'], axis=1).copy() y = data['ACTION'].copy() X.shape, y.shape",No,3,21.0 "from sklearn.model_selection import StratifiedShuffleSplit split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42) for train_index, test_index in split.split(X, y): strat_train_set = data.iloc[train_index] strat_test_set = data.iloc[test_index] X_train = strat_train_set.drop('ACTION', axis=1) y_train = strat_train_set['ACTION'].copy() X_test = strat_test_set.drop('ACTION', axis=1) y_test = strat_test_set['ACTION'].copy() X_train.shape, X_test.shape",Yes,4,13.0 "from sklearn.model_selection import KFold, cross_val_score kf = KFold(n_splits=5, shuffle=True, random_state=42)",Yes,5,84.0 "from sklearn.metrics import matthews_corrcoef, make_scorer, roc_auc_score, roc_curve Matthew = make_scorer(matthews_corrcoef) results = [] def plot_custom_roc_curve(clf_name, y_true, y_scores): auc_score = np.round(roc_auc_score(y_true, y_scores), 3) fpr, tpr, thresholds = roc_curve(y_true, y_scores) plt.plot(fpr, tpr, linewidth=2, label=clf_name+"" (AUC Score: {})"".format(str(auc_score))) plt.plot([0, 1], [0, 1], 'k--') # Dashed diagonal plt.axis([0, 1, 0, 1]) plt.xlabel(""FPR"", fontsize=16) plt.ylabel(""TPR"", fontsize=16) plt.legend() def performance_measures(model, X_tr=X_train_transformed, y_tr=y_train, X_ts=X_test_transformed, y_ts=y_test, store_results=True): train_mcc = cross_val_score(model, X_tr, y_tr, scoring=Matthew, cv=kf, n_jobs=-1) test_mcc = cross_val_score(model, X_ts, y_ts, scoring=Matthew, cv=kf, n_jobs=-1) print(""Mean Train MCC: {}\ Mean Test MCC: {}"".format(train_mcc.mean(), test_mcc.mean())) train_roc_auc = cross_val_score(model, X_tr, y_tr, scoring='roc_auc', cv=kf, n_jobs=-1) test_roc_auc = cross_val_score(model, X_ts, y_ts, scoring='roc_auc', cv=kf, n_jobs=-1) print(""Mean Train ROC AUC Score: {}\ Mean Test ROC AUC Score: {}"".format(train_roc_auc.mean(), test_roc_auc.mean())) if store_results: results.append([model.__class__.__name__, np.round(np.mean(train_roc_auc), 3), np.round(np.mean(test_roc_auc), 3), np.round(np.mean(train_mcc), 3), np.round(np.mean(test_mcc), 
3)])'",Yes,5,84.0 "def plot_feature_importance(feature_columns, importance_values, top_n_features=10): feature_imp = [ col for col in zip(feature_columns, importance_values)] feature_imp.sort(key=lambda x:x[1], reverse=True) if top_n_features: imp = pd.DataFrame(feature_imp[0:top_n_features], columns=['feature', 'importance']) else: imp = pd.DataFrame(feature_imp, columns=['feature', 'importance']) plt.figure(figsize=(10, 8)) sns.barplot(y='feature', x='importance', data=imp, orient='h') plt.title('Most Important Features', fontsize=16) plt.ylabel(""Feature"", fontsize=16) plt.xlabel("""") plt.show()'",No,5,79.0 "from sklearn.linear_model import LogisticRegression logistic_reg = LogisticRegression(solver='liblinear', C=1, penalty='l2', max_iter=1000, random_state=42, n_jobs=-1) logistic_reg.fit(X_train_transformed, y_train)",Yes,5,7.0 "plot_feature_importance(feature_columns, logistic_reg.coef_[0], top_n_features=15)",No,5,79.0 performance_measures(logistic_reg),No,4,35.0 "from sklearn.ensemble import RandomForestClassifier forest_clf = RandomForestClassifier(n_estimators=300, max_depth=16, random_state=42,n_jobs=-1) forest_clf.fit(X_train_transformed, y_train)",Yes,5,7.0 "plot_feature_importance(feature_columns, forest_clf.feature_importances_, top_n_features=15)",No,5,79.0 performance_measures(forest_clf),No,4,35.0 "from xgboost import XGBClassifier xgb_clf = XGBClassifier(n_estimators=300, max_depth=16, learning_rate=0.1, random_state=42, n_jobs=-1) xgb_clf.fit(X_train_transformed, y_train)",Yes,5,7.0 "plot_feature_importance(feature_columns, xgb_clf.feature_importances_, top_n_features=15)",No,5,79.0 performance_measures(xgb_clf),No,4,35.0 "from catboost import CatBoostClassifier catboost_clf = CatBoostClassifier(loss_function='Logloss', iterations=500, depth=6, l2_leaf_reg=1, cat_features=list(range(X_cb_train_transformed.shape[1])), eval_metric='AUC', random_state=42, verbose=0) catboost_clf.fit(X_cb_train_transformed, y_train)",Yes,5,7.0 "performance_measures(catboost_clf, X_tr=X_cb_train_transformed, X_ts=X_cb_test_transformed)",No,5,53.0 "plot_feature_importance(feature_columns, catboost_clf.feature_importances_, top_n_features=15)",No,5,79.0 "from sklearn.ensemble import VotingClassifier voting_reg = VotingClassifier(estimators=named_estimators, voting='soft', n_jobs=-1) voting_reg.fit(X_train, y_train)",Yes,5,7.0 "performance_measures(voting_reg, X_tr=X_train, X_ts=X_test)",No,4,35.0 "result_df = pd.DataFrame(results, columns=['Model', 'CV Train AUC Score', 'CV Test AUC Score', 'CV Train MCC', 'CV Test MCC']) result_df",No,5,12.0 "plt.figure(figsize=(8, 5)) plot_custom_roc_curve('Logistic Regression', y_test, logistic_reg.decision_function(X_test_transformed)) plot_custom_roc_curve('Random Forest', y_test, forest_clf.predict_proba(X_test_transformed)[:,1]) plot_custom_roc_curve('XGBoost', y_test, xgb_clf.predict_proba(X_test_transformed)[:,1]) plot_custom_roc_curve('CatBoost', y_test, catboost_clf.predict_proba(X_cb_test_transformed)[:,1]) plot_custom_roc_curve('Soft Voting', y_test, voting_reg.predict_proba(X_test)[:,1]) plt.show()",No,5,53.0 "final_model = Pipeline([('pre_process', cat_boost_pre_process), ('catboost', catboost_clf)]) final_model.fit(X_train, y_train)",Yes,5,7.0 "test_data = pd.read_csv('../input/amazon-employee-access-challenge/test.csv') test_data.head()",No,4,45.0 "output = pd.DataFrame(test_data['id']) test_data = test_data.drop('id', axis=1)",Yes,4,12.0 predictions = final_model.predict(test_data),No,5,48.0 output['ACTION'] = predictions.copy(),No,3,12.0 
"output.to_csv(""./submission.csv"", index=False)",No,5,25.0 "MAX_WEIGHT = 50.0 toys = { ""horse"": { ""sample"": lambda: max(0, np.random.normal(5,2,1)[0]), ""sample_type"": ""normal(5,2)"" }, ""ball"": { ""sample"": lambda: max(0, 1 + np.random.normal(1,0.3,1)[0]), ""sample_type"": ""normal(1,0.3)"" }, ""bike"": { ""sample"": lambda: max(0, np.random.normal(20,10,1)[0]), ""sample_type"": ""normal(20,10)"" }, ""train"": { ""sample"": lambda: max(0, np.random.normal(10,5,1)[0]), ""sample_type"": ""normal(10,5)"" }, ""coal"": { ""sample"": lambda: 47 * np.random.beta(0.5,0.5,1)[0], ""sample_type"": ""47*beta(0.5,0.5)"" }, ""book"": { ""sample"": lambda: np.random.chisquare(2,1)[0], ""sample_type"": ""chi(2)"" }, ""doll"": { ""sample"": lambda: np.random.gamma(5,1,1)[0], ""sample_type"": ""gamma(5,1)"" }, ""block"": { ""sample"": lambda: np.random.triangular(5,10,20,1)[0], ""sample_type"": ""triagl(5,10,20)"" }, ""gloves"": { ""sample"": lambda: 3.0 + np.random.rand(1)[0] if np.random.rand(1) < 0.3 else np.random.rand(1)[0], ""sample_type"": ""0.3:3+rand(1), 0.7:rand(1)"" }, } toy_names = list(toys) gifts_df = pd.read_csv(""../input/gifts.csv"", sep="","") gifts = gifts_df[""GiftId""].values print(""{} gifts"".format(len(gifts))) for t in toys: # get ranges samples = [toys[t][""sample""]() for _ in range(1000)] toys[t][""max""] = max(samples) toys[t][""min""] = min(samples) # get gift counts ids = [g for g in gifts if t in g.split(""_"")[0]] toys[t][""ids""] = ids toys[t][""count""] = len(ids) # print toy type stats print(""{:4}\\tdist: {:26}\\trange:{:5.2f} - {:5.2f}\\tcount:{:6,}"".format(t, toys[t][""sample_type""], toys[t][""min""], toys[t][""max""], toys[t][""count""]))'",Yes,3,8.0 "X_test['Store'].nunique(), X_test['Dept'].nunique(), X_test['Date'].nunique()",No,5,54.0 "X_test.shape, X_train.shape",No,5,58.0 "X_train['Year'] = pd.DatetimeIndex(X_train['Date']).year X_train['Month'] = pd.DatetimeIndex(X_train['Date']).month X_train['woy'] = pd.DatetimeIndex(X_train['Date']).weekofyear X_train['quarter'] = pd.DatetimeIndex(X_train['Date']).quarter X_test['Year'] = pd.DatetimeIndex(X_test['Date']).year X_test['Month'] = pd.DatetimeIndex(X_test['Date']).month X_test['woy'] = pd.DatetimeIndex(X_test['Date']).weekofyear X_test['quarter'] = pd.DatetimeIndex(X_test['Date']).quarter ## for future reference ## # df['dow'] = df.index.dayofweek # df['doy'] = df.index.dayofyear",No,5,8.0 X_all['Store'].unique(),No,5,57.0 "cols_num = [col for col in X_train.columns if X_train[col].dtype in [float, int]] ncols = len(cols_num) // 4 fig, axes = plt.subplots(ncols=ncols, nrows=5, figsize=(30,16)) i = 1 for j, col in enumerate(cols_num): sns.distplot(X_train[col], bins=10, ax=axes[i-1][j % ncols]) if j % ncols == (ncols - 1): i += 1 plt.tight_layout()",No,5,33.0 "X_train_tf['Date2'] = pd.to_datetime(X_train_tf['Date'], utc = True) X_test['Date2'] = pd.to_datetime(X_test['Date'], utc = True) X_train_tf['Weekly_Sales_tf_Lag_52_Weeks'] = X_train_tf.merge(X_all, left_on=['Store', 'Dept', 'Date2'], right_on=['Store', 'Dept', 'Date2'], how='inner')['Weekly_Sales_tf_Lag_52_Weeks'] X_test['Weekly_Sales_tf_Lag_52_Weeks'] = X_test.merge(X_all, left_on=['Store', 'Dept', 'Date2'], right_on=['Store', 'Dept', 'Date2'], how='inner')['Weekly_Sales_tf_Lag_52_Weeks'] X_train_tf.drop(['Date2', 'outlier'], axis=1, inplace=True) X_test.drop('Date2', axis=1, inplace=True) X_test.isna().sum()",No,3,32.0 "b""fig, axes = plt.subplots(ncols=5, figsize=(20,8))\nsns.distplot(X_train['Weekly_Sales'], bins=10, 
ax=axes[0]).set_title('Weekly Sales')\nsns.distplot(X_train['Weekly_Sales_Log'], bins=10, ax=axes[1]).set_title('Log(1+Weekly Sales)')\nsns.distplot(X_train_tf['Weekly_Sales_Log'], bins=10, ax=axes[2]).set_title('Log(1+Weekly Sales)\\nno outliers')\nsns.distplot(X_train['Weekly_Sales_tf'], bins=10, ax=axes[3]).set_title('(1+Weekly Sales)\\nQauntile Transformer')\nsns.distplot(X_train_tf['Weekly_Sales_tf'], bins=10, ax=axes[4]).set_title('(1+Weekly Sales)\\nQauntile Transformer\\nno outliers')\n\nplt.tight_layout()""",No,5,33.0 "print(X_train_tf.shape) X_train_tf.head()",No,4,41.0 "b""print(X_train_tf['Type'].value_counts(), '\\n', X_test['Type'].value_counts())\nprint(X_train_tf['IsHoliday'].value_counts(), '\\n', X_test['IsHoliday'].value_counts())""",No,5,72.0 "lbl_encoder = LabelEncoder() X_train_tf['IsHoliday'] = X_train_tf['IsHoliday'].replace(True, 5).replace(False, 1).values # go off the custom weighted-mae function X_train_tf['Type'] = lbl_encoder.fit_transform(X_train_tf['Type']) X_test['IsHoliday'] = X_test['IsHoliday'].replace(True, 5).replace(False, 1).values # go off the custom weighted-mae function X_test['Type'] = lbl_encoder.transform(X_test['Type'])",No,5,20.0 X_train_tf.head(),No,5,41.0 "def weighted_mae_custom(y_true, y_pred): ''' Custom weighting function as specified in the evaluation section. ''' weights = X_train_tf['IsHoliday'] sample_weights = pd.Series(weights.loc[y_true.index.values].values.reshape(-1)).dropna() return (1.0 / np.sum(sample_weights)) * np.sum(sample_weights * np.abs(y_true - y_pred)) weighted_mae = make_scorer(weighted_mae_custom)",No,5,84.0 X_train_tf.dtypes,No,5,70.0 "X_test.drop('Date', axis=1).isna().sum()",No,5,39.0 "## QuantileTransformer ## X_test['Weekly_Sales'] = qt.inverse_transform(best_models[2].predict(X_test.drop('Date', axis=1)).reshape(-1, 1)) + 1 X_test.head()",No,3,8.0 "import pandas as pd import os import numpy as np from sklearn.preprocessing import LabelEncoder print(os.listdir('../input')) file = ['train_users.csv', 'age_gender_bkts.csv', 'sessions.csv', 'countries.csv', 'test_users.csv'] data = {} for f in file: data[f.replace('.csv','')]=pd.read_csv('../input/'+f) train = data['train_users'] test = data['test_users'] # train = train.fillna(-100) # test = test.fillna(-100) age = data['age_gender_bkts'] sessions = data['sessions'] country = data['countries'] target = train['country_destination'] train = train.drop(['country_destination'],axis=1) ",No,3,22.0 "# temp = pd.DataFrame(train.apply(lambda row: isinstance(row['date_first_booking'], float), axis = 1)) # temp['destination'] = (target == 'NDF') # temp['comparison'] = temp.apply(lambda x: x[0] != x['destination'], axis = 1) # temp.apply(sum)",No,4,8.0 "# Imports # pandas import pandas as pd from pandas import Series,DataFrame # numpy, matplotlib, seaborn import numpy as np import matplotlib.pyplot as plt import seaborn as sns %matplotlib inline # machine learning from sklearn.linear_model import LogisticRegression, RandomizedLasso from sklearn.ensemble import GradientBoostingClassifier from sklearn.cross_validation import train_test_split from sklearn import metrics from sklearn import cross_validation from sklearn.preprocessing import OneHotEncoder, Imputer, LabelEncoder from sklearn.learning_curve import learning_curve from sklearn.grid_search import GridSearchCV from sklearn.pipeline import Pipeline",No,5,23.0 " plot_params = { ""font.size"":14.0, ""figure.figsize"": (6, 4), ""axes.labelsize"": ""large"", ""figure.autolayout"": True, ""patch.edgecolor"":""white"", 
""axes.facecolor"": ""#f0f0f0"", ""patch.edgecolor"": ""#f0f0f0"", ""figure.facecolor"": ""#f0f0f0"", ""grid.linestyle"": ""-"", ""grid.linewidth"": 1.0, ""grid.color"": ""#cbcbcb"", ""savefig.edgecolor"": ""#f0f0f0"", ""savefig.facecolor"": ""#f0f0f0"" } sns.set(rc=plot_params)",No,5,23.0 "# get airbnb & test csv files as a DataFrame airbnb_df = pd.read_csv('../input/train_users.csv') test_df = pd.read_csv('../input/test_users.csv') # preview the data airbnb_df.head()",No,4,45.0 "# drop unnecessary columns, these columns won't be useful in analysis and prediction airbnb_df = airbnb_df.drop(['date_account_created','timestamp_first_active'], axis=1) test_df = test_df.drop(['date_account_created','timestamp_first_active'], axis=1)",No,5,10.0 "airbnb_df['booked'] = (airbnb_df['country_destination'] != 'NDF').astype(int) gp = airbnb_df[['id','country_destination']].groupby('country_destination').count() ax = gp.sort('id').plot(kind='barh', color=['#0059b3']) ax.set_xlabel('# of bookings') ax.set_ylabel('country destination') ax.legend_.remove()",No,5,81.0 "# date_first_booking def get_year(date): if date == date: return int(str(date)[:4]) return date def get_month(date): if date == date: return int(str(date)[5:7]) return date # Create Year and Month columns airbnb_df['Year'] = airbnb_df['date_first_booking'].apply(get_year) airbnb_df['Month'] = airbnb_df['date_first_booking'].apply(get_month) test_df['Year'] = test_df['date_first_booking'].apply(get_year) test_df['Month'] = test_df['date_first_booking'].apply(get_month) # fill NaN airbnb_df['Year'].fillna(airbnb_df['Year'].median(), inplace=True) airbnb_df['Month'].fillna(airbnb_df['Month'].median(), inplace=True) test_df['Year'].fillna(test_df['Year'].median(), inplace=True) test_df['Month'].fillna(test_df['Month'].median(), inplace=True) # convert type to integer airbnb_df[['Year', 'Month']] = airbnb_df[['Year', 'Month']].astype(int) test_df[['Year', 'Month']] = test_df[['Year', 'Month']].astype(int) ",No,3,8.0 "# age

# assign all age values > 100 to NaN, these NaN values will be replaced with real ages below
airbnb_df[""age""][airbnb_df[""age""] > 100] = np.NaN
test_df[""age""][test_df[""age""] > 100] = np.NaN

# get average, std, and number of NaN values in airbnb_df
average_age_airbnb = airbnb_df[""age""].mean()
std_age_airbnb = airbnb_df[""age""].std()
count_nan_age_airbnb = airbnb_df[""age""].isnull().sum()

# get average, std, and number of NaN values in test_df
average_age_test = test_df[""age""].mean()
std_age_test = test_df[""age""].std()
count_nan_age_test = test_df[""age""].isnull().sum()

# generate random numbers between (mean - std) & (mean + std)
rand_1 = np.random.randint(average_age_airbnb - std_age_airbnb, average_age_airbnb + std_age_airbnb, size = count_nan_age_airbnb)
rand_2 = np.random.randint(average_age_test - std_age_test, average_age_test + std_age_test, size = count_nan_age_test)

# fill NaN values in Age column with random values generated
airbnb_df[""age""][np.isnan(airbnb_df[""age""])] = rand_1
test_df[""age""][np.isnan(test_df[""age""])] = rand_2

# convert type to integer
airbnb_df[\'age\'] = airbnb_df[\'age\'].astype(int)
test_df[\'age\'] = test_df[\'age\'].astype(int)",No,3,17.0 "# signup_method
airbnb_df[""signup_method""] = (airbnb_df[""signup_method""] == ""basic"").astype(int)
test_df[""signup_method""] = (test_df[""signup_method""] == ""basic"").astype(int)

# signup_flow
airbnb_df[""signup_flow""] = (airbnb_df[""signup_flow""] == 3).astype(int)
test_df[""signup_flow""] = (test_df[""signup_flow""] == 3).astype(int)

# language
airbnb_df[""language""] = (airbnb_df[""language""] == \'en\').astype(int)
test_df[""language""] = (test_df[""language""] == \'en\').astype(int)

# affiliate_channel
airbnb_df[""affiliate_channel""] = (airbnb_df[""affiliate_channel""] == \'direct\').astype(int)
test_df[""affiliate_channel""] = (test_df[""affiliate_channel""] == \'direct\').astype(int)

# affiliate_provider
airbnb_df[""affiliate_provider""] = (airbnb_df[""affiliate_provider""] == \'direct\').astype(int)
test_df[""affiliate_provider""] = (test_df[""affiliate_provider""] == \'direct\').astype(int)
",No,5,16.0 "for f in airbnb_df.columns:
if f == ""country_destination"" or f == ""id"": continue
if airbnb_df[f].dtype == \'object\':
lbl = LabelEncoder()
lbl.fit(np.unique(list(airbnb_df[f].values) + list(test_df[f].values)))
airbnb_df[f] = lbl.transform(list(airbnb_df[f].values))
test_df[f] = lbl.transform(list(test_df[f].values))",No,5,20.0 "X = airbnb_df.drop([""country_destination"", ""id"", \'booked\'],axis=1)
y = airbnb_df[""country_destination""]
test = test_df.drop(""id"",axis=1).copy()",No,4,21.0 "# modify country_destination to numerical values country_num_dic = {'NDF': 0, 'US': 1, 'other': 2, 'FR': 3, 'IT': 4, 'GB': 5, 'ES': 6, 'CA': 7, 'DE': 8, 'NL': 9, 'AU': 10, 'PT': 11} num_country_dic = {y:x for x,y in country_num_dic.items()} y = y.map(country_num_dic)",No,5,20.0 "# convert type to integer ypred = ypred.astype(int) # change values back to original country symbols ypred = Series(ypred).map(num_country_dic)",No,4,16.0 "
# Create submission

country_df = pd.DataFrame({
""id"": test_df[""id""],
""country"": ypred
})

submission = DataFrame(columns=[""id"", ""country""])

# sort countries according to most probable destination country
for key in country_df[\'country\'].value_counts().index:
submission = pd.concat([submission, country_df[country_df[""country""] == key]], ignore_index=True)

submission.to_csv(\'airbnb.csv\', index=False)",No,5,25.0 "from pandas import Series,DataFrame
import pandas as pd

# numpy, matplotlib, seaborn
import numpy as np

names=[
\'Field6\',
\'Field7\',
\'Field8\',
\'Field10\',
\'CoverageField2A\',
\'CoverageField2B\',
\'CoverageField3A\',
\'CoverageField4B\',
\'CoverageField5A\',
\'CoverageField5B\',
\'CoverageField6A\',
\'CoverageField6B\',
\'CoverageField8\',
\'CoverageField11A\',
\'CoverageField11B\',
\'SalesField1A\',
\'SalesField1B\',
\'SalesField2A\',
\'SalesField2B\',
\'SalesField3\',
\'SalesField4\',
\'SalesField6\',
\'SalesField7\',
\'SalesField8\',
\'SalesField9\',
\'SalesField10\',
\'SalesField12\',
\'SalesField13\',
\'SalesField14\',
\'SalesField15\',
\'PersonalField1\',
\'PersonalField2\',
\'PersonalField4B\',
\'PersonalField5\',
\'PersonalField6\',
\'PersonalField7\',
\'PersonalField8\',
\'PersonalField9\',
\'PersonalField10A\',
\'PersonalField10B\',
\'PersonalField11\',
\'PersonalField12\',
\'PersonalField13\',
\'PersonalField15\',
\'PersonalField16\',
\'PersonalField17\',
\'PersonalField18\',
\'PersonalField19\',
\'PersonalField22\',
\'PersonalField23\',
\'PersonalField25\',
\'PersonalField27\',
\'PersonalField29\',
\'PersonalField33\',
\'PersonalField34\',
\'PersonalField36\',
\'PersonalField37\',
\'PersonalField38\',
\'PersonalField39\',
\'PersonalField40\',
\'PersonalField41\',
\'PersonalField42\',
\'PersonalField47\',
\'PersonalField48\',
\'PersonalField49\',
\'PersonalField50\',
\'PersonalField51\',
\'PersonalField52\',
\'PersonalField53\',
\'PersonalField56\',
\'PersonalField57\',
\'PersonalField59\',
\'PersonalField60\',
\'PersonalField62\',
\'PersonalField63\',
\'PersonalField64\',
\'PersonalField66\',
\'PersonalField69\',
\'PersonalField70\',
\'PersonalField71\',
\'PersonalField74\',
\'PersonalField75\',
\'PersonalField77\',
\'PersonalField81\',
\'PersonalField82\',
\'PersonalField83\',
\'PersonalField84\',
\'PropertyField1A\',
\'PropertyField1B\',
\'PropertyField2A\',
\'PropertyField2B\',
\'PropertyField3\',
\'PropertyField4\',
\'PropertyField6\',
\'PropertyField7\',
\'PropertyField8\',
\'PropertyField9\',
\'PropertyField10\',
\'PropertyField11B\',
\'PropertyField12\',
\'PropertyField13\',
\'PropertyField14\',
\'PropertyField15\',
\'PropertyField16B\',
\'PropertyField18\',
\'PropertyField19\',
\'PropertyField20\',
\'PropertyField21B\',
\'PropertyField22\',
\'PropertyField23\',
\'PropertyField24B\',
\'PropertyField25\',
\'PropertyField26A\',
\'PropertyField26B\',
\'PropertyField27\',
\'PropertyField28\',
\'PropertyField29\',
\'PropertyField30\',
\'PropertyField31\',
\'PropertyField32\',
\'PropertyField33\',
\'PropertyField34\',
\'PropertyField35\',
\'PropertyField36\',
\'PropertyField37\',
\'PropertyField38\',
\'PropertyField39A\',
\'GeographicField1A\',
\'GeographicField2B\',
\'GeographicField4A\',
\'GeographicField4B\',
\'GeographicField5A\',
\'GeographicField6A\',
\'GeographicField8A\',
\'GeographicField11A\',
\'GeographicField13B\',
\'GeographicField15A\',
\'GeographicField16B\',
\'GeographicField17A\',
\'GeographicField17B\',
\'GeographicField18A\',
\'GeographicField20B\',
\'GeographicField21B\',
\'GeographicField22A\',
\'GeographicField22B\',
\'GeographicField23A\',
\'GeographicField23B\',
\'GeographicField24A\',
\'GeographicField26A\',
\'GeographicField27A\',
\'GeographicField29B\',
\'GeographicField30B\',
\'GeographicField32A\',
\'GeographicField33B\',
\'GeographicField36B\',
\'GeographicField37B\',
\'GeographicField38A\',
\'GeographicField39B\',
\'GeographicField41A\',
\'GeographicField41B\',
\'GeographicField42B\',
\'GeographicField43A\',
\'GeographicField44A\',
\'GeographicField45A\',
\'GeographicField45B\',
\'GeographicField46B\',
\'GeographicField48A\',
\'GeographicField48B\',
\'GeographicField50B\',
\'GeographicField52B\',
\'GeographicField53B\',
\'GeographicField54B\',
\'GeographicField55B\',
\'GeographicField56A\',
\'GeographicField59A\',
\'GeographicField59B\',
\'GeographicField60A\',
\'GeographicField60B\',
\'GeographicField61A\',
\'GeographicField61B\',
\'GeographicField62A\',
\'GeographicField62B\',
\'GeographicField63\',
\'Year\',
\'Month\'
]


import random
from datetime import datetime
import pandas as pd
from pandas import DataFrame as df
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split, cross_val_score  # sklearn.cross_validation was replaced by model_selection
from sklearn import preprocessing

train = pd.read_csv(\'../input/train.csv\')
test = pd.read_csv(\'../input/test.csv\')

train_sample = np.random.choice(train.index.values,130000)
train = train.loc[train_sample]  # .ix was removed from pandas; .loc selects by index label
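# note: np.random.choice samples with replacement by default, so the 130,000 rows drawn above
# may contain duplicates; replace=False (or train.sample(130000)) would give a true subsample.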

# Converting date into datetime format
train[\'Date\'] = pd.to_datetime(pd.Series(train[\'Original_Quote_Date\']))
# Dropping original date column
train = train.drop(\'Original_Quote_Date\', axis=1)

test[\'Date\'] = pd.to_datetime(pd.Series(test[\'Original_Quote_Date\']))
test = test.drop(\'Original_Quote_Date\', axis=1)

## Separating date into 3 columns
train[\'Year\'] = train[\'Date\'].apply(lambda x: int(str(x)[:4]))
train[\'Month\'] = train[\'Date\'].apply(lambda x: int(str(x)[5:7]))
train[\'weekday\'] = train[\'Date\'].dt.dayofweek

test[\'Year\'] = test[\'Date\'].apply(lambda x: int(str(x)[:4]))
test[\'Month\'] = test[\'Date\'].apply(lambda x: int(str(x)[5:7]))
test[\'weekday\'] = test[\'Date\'].dt.dayofweek

train = train.drop(\'Date\', axis=1)
test = test.drop(\'Date\', axis=1)

## Filling NA values with -1

train = train.fillna(-1)
test = test.fillna(-1)
test_ori=test

y = train.QuoteConversion_Flag.values

#columns choice--gmm
train=DataFrame(train,columns=names)
test=DataFrame(test,columns=names)

for f in train.columns:
if train[f].dtype==\'object\':
print(f)
lbl=preprocessing.LabelEncoder()
lbl.fit(list(train[f].values)+list(test[f].values))
train[f]=lbl.transform(list(train[f].values))
test[f]=lbl.transform(list(test[f].values))

import xgboost as xgb

X_train = train
Y_train =y
X_test = test

params = {""objective"": ""binary:logistic""}
T_train_xgb = xgb.DMatrix(X_train, Y_train)
X_test_xgb = xgb.DMatrix(X_test)
gbm = xgb.train(params, T_train_xgb, 20)
Y_pred = gbm.predict(X_test_xgb)
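# note: with objective 'binary:logistic' the booster's predict() returns probabilities in
# [0, 1], which suits the AUC-scored QuoteConversion_Flag column written below.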
# Create submission
submission = pd.DataFrame()
submission[""QuoteNumber""] = test_ori[""QuoteNumber""]
submission[""QuoteConversion_Flag""] = Y_pred
submission.to_csv(\'homesite.csv\', index=False)
",No,2,23.0 "import pandas as pd import numpy as np",No,5,22.0 "df_train_users = pd.read_csv('../input/train_users_2.csv') df_test_users = pd.read_csv('../input/test_users.csv')",No,5,45.0 "# let\'s look at the destinations accounted for each occurence in the train set
df_train_users.groupby(""country_destination"").count()[""id""]",No,5,60.0 "# the 5 most frequent ""destinations"" are [""NDF"",""US"",""other"",""FR"",""IT""]
# baseline: predict [""NDF"",""US"",""other"",""FR"",""IT""] for each user in the test set

res = [[x, destination] for x in df_test_users[""id""] for destination in [""NDF"",""US"",""other"",""FR"",""IT""]]
sub_baseline = pd.DataFrame(np.array(res), columns=[\'id\', \'country\'])
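# note: the competition metric is NDCG@5, so each test user gets up to five ranked rows;
# repeating the five globally most frequent destinations is the usual frequency baseline.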
sub_baseline.to_csv(\'sub_baseline.csv\', index=False)",No,5,25.0 "import numpy as np
import pandas as pd
import math
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler


# https://github.com/sublee/elo/blob/master/elo.py
""""""
elo
~~~
The Elo rating system.
:copyright: (c) 2012 by Heungsub Lee
:license: BSD, see LICENSE for more details.
""""""
from datetime import datetime
import inspect

__version__ = \'0.1.dev\'
__all__ = [\'Elo\', \'Rating\', \'CountedRating\', \'TimedRating\', \'rate\', \'adjust\',
\'expect\', \'rate_1vs1\', \'adjust_1vs1\', \'quality_1vs1\', \'setup\',
\'global_env\', \'WIN\', \'DRAW\', \'LOSS\', \'K_FACTOR\', \'RATING_CLASS\',
\'INITIAL\', \'BETA\']


#: The actual score for win.
WIN = 1.01
#: The actual score for draw.
DRAW = 0.5
#: The actual score for loss.
LOSS = 0.

#: Default K-factor.
K_FACTOR = 10
#: Default rating class.
RATING_CLASS = float
#: Default initial rating.
INITIAL = 1300
#: Default Beta value.
BETA = 170


class Rating(object):

try:
__metaclass__ = __import__(\'abc\').ABCMeta
except ImportError:
# for Python 2.5
pass

value = None

def __init__(self, value=None):
if value is None:
value = global_env().initial
self.value = value

def rated(self, value):
""""""Creates a :class:Rating object for the recalculated rating.
:param value: the recalculated rating value.
""""""
return type(self)(value)

def __int__(self):
""""""Type-casting to int.""""""
return int(self.value)

def __long__(self):
""""""Type-casting to long.""""""
return long(self.value)

def __float__(self):
""""""Type-casting to float.""""""
return float(self.value)

def __nonzero__(self):
""""""Type-casting to bool.""""""
return bool(int(self))

def __eq__(self, other):
return float(self) == float(other)

def __lt__(self, other):
""""""Is Rating < number.
:param other: the operand
:type other: number
""""""
return self.value < other

def __le__(self, other):
""""""Is Rating <= number.
:param other: the operand
:type other: number
""""""
return self.value <= other

def __gt__(self, other):
""""""Is Rating > number.
:param other: the operand
:type other: number
""""""
return self.value > other

def __ge__(self, other):
""""""Is Rating >= number.
:param other: the operand
:type other: number
""""""
return self.value >= other

def __iadd__(self, other):
""""""Rating += number.
:param other: the operand
:type other: number
""""""
self.value += other
return self

def __isub__(self, other):
""""""Rating -= number.
:param other: the operand
:type other: number
""""""
self.value -= other
return self

def __repr__(self):
c = type(self)
        ext_params = inspect.getfullargspec(c.__init__)[0][2:]  # getargspec was removed in Python 3.11
kwargs = \', \'.join(\'%s=%r\' % (param, getattr(self, param))
for param in ext_params)
if kwargs:
kwargs = \', \' + kwargs
args = (\'.\'.join([c.__module__, c.__name__]), self.value, kwargs)
return \'%s(%.3f%s)\' % args


try:
Rating.register(float)
except AttributeError:
pass


class CountedRating(Rating):
""""""Increases count each rating recalculation.""""""

times = None

def __init__(self, value=None, times=0):
self.times = times
super(CountedRating, self).__init__(value)

def rated(self, value):
rated = super(CountedRating, self).rated(value)
rated.times = self.times + 1
return rated


class TimedRating(Rating):
""""""Writes the final rated time.""""""

rated_at = None

def __init__(self, value=None, rated_at=None):
self.rated_at = rated_at
super(TimedRating, self).__init__(value)

def rated(self, value):
rated = super(TimedRating, self).rated(value)
rated.rated_at = datetime.utcnow()
return rated


class Elo(object):

def __init__(self, k_factor=K_FACTOR, rating_class=RATING_CLASS,
initial=INITIAL, beta=BETA):
self.k_factor = k_factor
self.rating_class = rating_class
self.initial = initial
self.beta = beta

def expect(self, rating, other_rating):
""""""The ""E"" function in Elo. It calculates the expected score of the
first rating by the second rating.
""""""
# http://www.chess-mind.com/en/elo-system
diff = float(other_rating) - float(rating)
f_factor = 2 * self.beta # rating disparity
return 1. / (1 + 11 ** (diff / f_factor))
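        # worked example (beta=170 -> divisor 340): for rating=1300 vs other_rating=1470,
        # diff=170 and expect = 1 / (1 + 11 ** 0.5) ~ 0.23, i.e. roughly a 1-in-4 chance.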

def adjust(self, rating, series):
""""""Calculates the adjustment value.""""""
return sum(score - self.expect(rating, other_rating)
for score, other_rating in series)

def rate(self, rating, series):
""""""Calculates new ratings by the game result series.""""""
rating = self.ensure_rating(rating)
k = self.k_factor(rating) if callable(self.k_factor) else self.k_factor
new_rating = float(rating) + k * self.adjust(rating, series)
if hasattr(rating, \'rated\'):
new_rating = rating.rated(new_rating)
return new_rating

def adjust_1vs1(self, rating1, rating2, drawn=False):
return self.adjust(rating1, [(DRAW if drawn else WIN, rating2)])

def rate_1vs1(self, rating1, rating2, drawn=False):
scores = (DRAW, DRAW) if drawn else (WIN, LOSS)
return (self.rate(rating1, [(scores[0], rating2)]),
self.rate(rating2, [(scores[1], rating1)]))

def quality_1vs1(self, rating1, rating2):
return 2 * (0.5 - abs(0.5 - self.expect(rating1, rating2)))

def create_rating(self, value=None, *args, **kwargs):
if value is None:
value = self.initial
return self.rating_class(value, *args, **kwargs)

def ensure_rating(self, rating):
if isinstance(rating, self.rating_class):
return rating
return self.rating_class(rating)

def make_as_global(self):
""""""Registers the environment as the global environment.
>>> env = Elo(initial=2000)
>>> Rating()
elo.Rating(1200.000)
>>> env.make_as_global() #doctest: +ELLIPSIS
elo.Elo(..., initial=2000.000, ...)
>>> Rating()
elo.Rating(2000.000)
But if you need just one environment, use :func:setup instead.
""""""
return setup(env=self)

def __repr__(self):
c = type(self)
rc = self.rating_class
if callable(self.k_factor):
f = self.k_factor
k_factor = \'.\'.join([f.__module__, f.__name__])
else:
k_factor = \'%.3f\' % self.k_factor
args = (\'.\'.join([c.__module__, c.__name__]), k_factor,
\'.\'.join([rc.__module__, rc.__name__]), self.initial, self.beta)
return (\'%s(k_factor=%s, rating_class=%s, \'
\'initial=%.3f, beta=%.3f)\' % args)


def rate(rating, series):
return global_env().rate(rating, series)


def adjust(rating, series):
return global_env().adjust(rating, series)


def expect(rating, other_rating):
return global_env().expect(rating, other_rating)


def rate_1vs1(rating1, rating2, drawn=False):
return global_env().rate_1vs1(rating1, rating2, drawn)


def adjust_1vs1(rating1, rating2, drawn=False):
return global_env().adjust_1vs1(rating1, rating2, drawn)


def quality_1vs1(rating1, rating2):
return global_env().quality_1vs1(rating1, rating2)


def setup(k_factor=K_FACTOR, rating_class=RATING_CLASS,
initial=INITIAL, beta=BETA, env=None):
if env is None:
env = Elo(k_factor, rating_class, initial, beta)
global_env.__elo__ = env
return env


def global_env():
""""""Gets the global Elo environment.""""""
try:
global_env.__elo__
except AttributeError:
# setup the default environment
setup()
return global_env.__elo__
# -------------------------------------------------------



def Outputs(data):
return 1.-(1./(1.+np.exp(-data)))
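# note: GPIndividual1 below is a hard-coded scoring formula over the engineered features
# (apparently the output of a genetic-programming search); Outputs() maps its raw score
# through 1 - sigmoid, so larger raw values become probabilities closer to 0.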


def GPIndividual1(data):
predictions = (np.sinh(((((np.sinh(data[""team1Seed""]) - data[""team2Seed""]) + ((np.tanh(data[""team2Wmax""]) + ((data[""team1Lstd""] + np.minimum( (data[""team1losses""]), (data[""year""])))/2.0))/2.0)) + ((data[""team2Seed""] == (1.0/(1.0 + np.exp(- data[""team2wins""])))).astype(float)))/2.0)) +
((np.cos(((np.round(data[""team2Wmedian""]) <= data[""team1LAverage""]).astype(float))) - np.maximum( ((data[""team1Seed""] * data[""team2Lstd""])), (np.round(np.tanh(np.maximum( (np.maximum( (data[""team2Lstd""]), (data[""team2Wstd""]))), (data[""team1wins""]))))))) / 2.0) +
((np.floor(np.minimum( (((1.732051 == data[""team1Lmax""]).astype(float))), (np.cos(data[""team1WAverage""])))) == ((np.round(((((-(data[""team2LAverage""])) <= data[""team2losses""]).astype(float)) - 2.212120)) <= (data[""team2Wmin""] * data[""team2losses""])).astype(float))).astype(float)) +
np.minimum( ((np.abs(data[""team1Wmedian""]) - ((data[""team1Seed""] >= np.abs(data[""team1Lmedian""])).astype(float)))), (np.round(np.sinh(np.minimum( ((((data[""team2WAverage""] != data[""team1Seed""]).astype(float)) + data[""team2WAverage""])), ((-(((data[""team1Wmin""] >= 2.718282).astype(float)))))))))) +
((np.minimum( (-1.0), (data[""team2Lmax""])) > (data[""team2Wmax""] - np.minimum( (data[""team1Lmax""]), ((data[""team2Wmin""] * (data[""team2Wmax""] - np.minimum( (data[""team2Wmin""]), (np.tanh(np.sin((data[""team1Lmax""] * 2.0))))))))))).astype(float)) +
np.minimum( (((data[""team2WAverage""] >= np.floor(data[""team2Wmin""])).astype(float))), (np.abs(((data[""team1Seed""] >= np.sinh(((0.693147 > np.minimum( (data[""team2Wmedian""]), (((data[""team1Seed""] <= ((data[""team1Wmedian""] <= np.cos(0.693147)).astype(float))).astype(float))))).astype(float)))).astype(float))))) +
np.sin(np.sinh(((((-((((-(0.367879)) <= data[""team1Lmax""]).astype(float)))) + ((data[""team1Wmin""] >= np.floor(data[""team1Seed""])).astype(float)))/2.0) - ((np.sin(data[""team2Wstd""]) > (data[""team2Wmax""] + np.abs(data[""team2""]))).astype(float))))) +
(((((np.sin(data[""team1Lmax""]) > data[""team1Wstd""]).astype(float)) != ((data[""team1Lmax""] == data[""team2wins""]).astype(float))).astype(float)) * (((-(data[""team2Wmin""])) + ((data[""team1Lstd""] + np.minimum( (data[""team2wins""]), (np.minimum( (data[""team1Lmax""]), (data[""team2Wmin""])))))/2.0))/2.0)) +
np.maximum( (np.minimum( (data[""team1Wmin""]), (np.ceil(np.minimum( (np.minimum( (0.138462), (((data[""team1Seed""] >= data[""team2Lmedian""]).astype(float))))), (data[""team2losses""])))))), ((((-(np.maximum( (data[""team2""]), (data[""team2Seed""])))) > ((data[""team1losses""] < 1.414214).astype(float))).astype(float)))) +
np.minimum( (np.maximum( (data[""team1Lmin""]), ((-(((data[""team1wins""] >= np.cos(0.720430)).astype(float))))))), (np.minimum( (np.ceil((data[""team1Wmedian""] / 2.0))), ((-(((data[""team1Wmin""] >= (1.197370 + ((data[""team1Wstd""] < 0.094340).astype(float)))).astype(float)))))))) +
((((-(np.abs(np.abs(((data[""team1""] + (-(0.138462)))/2.0))))) * ((0.367879 >= data[""team2wins""]).astype(float))) > ((data[""team1Wmedian""] + np.maximum( (data[""team1""]), (((0.367879 != data[""team1""]).astype(float)))))/2.0)).astype(float)) +
((3.0 == np.maximum( (np.round(np.maximum( (np.sinh(data[""team2Lmin""])), (data[""team2LAverage""])))), (np.floor(np.maximum( (np.sinh(np.maximum( ((data[""team1""] * 2.0)), (data[""team1Wmedian""])))), (np.sinh((data[""team2Wmedian""] * np.sin(data[""team2WAverage""]))))))))).astype(float)) +
np.minimum( (np.ceil(((data[""team2Wmin""] + ((0.094340 >= data[""team2""]).astype(float)))/2.0))), ((np.minimum( ((data[""team1Lmax""] * data[""team1Wmedian""])), (((data[""team1Wmedian""] < (data[""team1Wmax""] - data[""team2Lmedian""])).astype(float)))) * np.maximum( (data[""team2Seed""]), (data[""team1Wmax""]))))) +
((-(((data[""team2Wmin""] >= ((-((data[""team2Wstd""] + 0.318310))) * (1.0/(1.0 + np.exp(- (-((1.0/(1.0 + np.exp(- 0.318310)))))))))).astype(float)))) * (((1.0/(1.0 + np.exp(- data[""year""]))) * data[""team1Lmin""]) * data[""team2Lstd""])) +
np.floor(np.cos(((data[""team1WAverage""] * np.minimum( ((data[""team1WAverage""] * data[""team2Lmax""])), (data[""team1Lstd""]))) * ((np.sin(np.round(data[""team2Lmax""])) + ((data[""team1LAverage""] != data[""team2WAverage""]).astype(float)))/2.0)))) +
np.ceil(((2.675680 <= np.abs(np.maximum( ((data[""team2""] + np.maximum( (np.abs(data[""team1Seed""])), (data[""team2LAverage""])))), ((0.318310 + (np.minimum( (data[""team1Wmedian""]), (((data[""team2""] != data[""team2LAverage""]).astype(float)))) - data[""team2Lmin""])))))).astype(float))) +
((np.sinh(np.sin(data[""team2Lstd""])) * np.round(np.minimum( (np.sin(np.sinh(np.round(data[""team1Wmin""])))), ((np.minimum( (data[""team1Lstd""]), (np.sin(data[""team2Lstd""]))) * np.sin(data[""team2Lstd""])))))) / 2.0) +
((((data[""team2""] <= ((((data[""team1Lmin""] + 5.428570)/2.0) + (np.cos(np.maximum( (data[""team1Lmin""]), (np.maximum( ((1.0/(1.0 + np.exp(- data[""team1Wmin""])))), (data[""team2LAverage""]))))) / 2.0))/2.0)).astype(float)) <= ((((data[""team2""] + 5.428570)/2.0) <= data[""team1Wmedian""]).astype(float))).astype(float)) +
np.floor(np.cos((data[""team2Lmin""] * np.minimum( ((data[""team1wins""] + ((data[""year""] + data[""team2Wmedian""])/2.0))), ((np.minimum( (data[""year""]), (data[""team1wins""])) + ((((data[""year""] > data[""team2Wmin""]).astype(float)) + data[""team2Wmedian""])/2.0))))))) +
np.minimum( ((((np.minimum( (((data[""team1Wmedian""] >= data[""team1WAverage""]).astype(float))), (data[""team2losses""])) / 2.0) > data[""team1Lmax""]).astype(float))), (((data[""team1Wmedian""] >= (1.0/(1.0 + np.exp(- ((data[""team1Wmax""] > ((data[""team1WAverage""] < (data[""team1Wmedian""] - data[""team2Lmin""])).astype(float))).astype(float)))))).astype(float)))) +
(((0.602941 <= (data[""team2Wmin""] - ((((np.cos(data[""team1Wmedian""]) * 2.0) * 2.0) >= 1.570796).astype(float)))).astype(float)) * np.sin(np.sinh(np.sinh(data[""team2Wstd""])))) +
(data[""team1losses""] * ((data[""team1Wmedian""] >= (np.tanh((((((-(data[""team1Lmin""])) > 0.434294).astype(float)) != ((np.minimum( (data[""team1wins""]), (((0.434294 <= data[""team2losses""]).astype(float)))) < data[""team1Wmedian""]).astype(float))).astype(float))) * 2.0)).astype(float))) +
np.maximum( (((np.minimum( (data[""team1Lmedian""]), (((data[""team2""] > np.maximum( (data[""team2losses""]), (0.585714))).astype(float)))) >= (data[""team1WAverage""] + ((1.414214 > data[""team2Lmin""]).astype(float)))).astype(float))), (((data[""team2Wmin""] < (data[""team1wins""] - 3.141593)).astype(float)))) +
np.round((np.round(((data[""team2Lmedian""] * ((data[""year""] > ((2.675680 + ((data[""team1LAverage""] <= np.maximum( (data[""team1WAverage""]), (0.094340))).astype(float)))/2.0)).astype(float))) * 2.0)) * 2.0)) +
((np.abs(np.sinh(np.abs(data[""team2Lstd""]))) <= (data[""team1Lmax""] * ((data[""team2losses""] <= (-(((((data[""team2Lstd""] <= np.minimum( (data[""team2wins""]), (data[""team2losses""]))).astype(float)) < np.maximum( (data[""team2Wmedian""]), (data[""team1losses""]))).astype(float))))).astype(float)))).astype(float)) +
np.minimum( (np.cos(data[""team1""])), (((((((((((data[""team1""] / 2.0) / 2.0) * 9.869604) <= 0.058823).astype(float)) <= data[""team1Wstd""]).astype(float)) == np.ceil(((data[""team1""] / 2.0) * 9.869604))).astype(float)) - 0.094340))) +
np.maximum( (np.round(((2.212120 <= (data[""team1Wmax""] - ((data[""team2Lmin""] + data[""team2Lmedian""])/2.0))).astype(float)))), (((3.0 < (data[""team2losses""] + np.maximum( ((-(((data[""team2Lmin""] + data[""team2LAverage""])/2.0)))), (data[""team1""])))).astype(float)))) +
((data[""team2wins""] - np.sin(data[""team2Wmin""])) * ((np.maximum( (data[""team2wins""]), (0.840000)) <= np.minimum( (data[""team1Lmax""]), ((np.maximum( (data[""team2Wmax""]), ((data[""team2wins""] * np.floor(data[""team2Wmax""])))) - 0.058823)))).astype(float))) +
((math.tanh((-(1.630430))) > np.sin(np.maximum( (data[""team2Wmin""]), (np.minimum( (np.minimum( (data[""team2Seed""]), (((data[""team1LAverage""] + data[""team1Wstd""])/2.0)))), ((((data[""team2Seed""] <= data[""team2Wmin""]).astype(float)) - data[""team2Lstd""]))))))).astype(float)) +
np.floor(np.cos(((1.570796 + (np.minimum( (data[""team1LAverage""]), (((data[""team1WAverage""] <= ((((((data[""team1Wmin""] + data[""team1WAverage""])/2.0) < ((data[""team2Seed""] >= data[""team1Seed""]).astype(float))).astype(float)) + (data[""team2Lmedian""] * 0.636620))/2.0)).astype(float)))) * 2.0))/2.0))) +
((data[""team2Wmin""] > ((0.318310 + (((1.0/(1.0 + np.exp(- (((data[""team2Seed""] * np.maximum( (data[""team2""]), (data[""team1Lmax""]))) <= ((data[""team2losses""] < data[""team1Lmax""]).astype(float))).astype(float))))) < data[""team2Wmin""]).astype(float))) * 2.0)).astype(float)) +
np.sinh(np.floor((0.367879 - (((((np.minimum( (data[""team1LAverage""]), (np.floor(data[""team2WAverage""]))) == ((2.409090 < -3.0))).astype(float)) + (data[""team2wins""] * np.sin(np.minimum( (data[""team1Lmax""]), (data[""team2WAverage""])))))/2.0) / 2.0)))) +
(((data[""team1Wmax""] < (-2.0 + ((data[""team1wins""] < ((data[""team1Wstd""] - (((data[""team1Wstd""] * data[""team1Wstd""]) < data[""team2Wstd""]).astype(float))) - np.sinh((((data[""team2Wmax""] > data[""team2Lstd""]).astype(float)) * 2.0)))).astype(float)))).astype(float)) * 2.0) +
np.tanh(np.sin(np.round(np.tanh((data[""team2Wmax""] * ((np.round(data[""team2LAverage""]) == ((((data[""team1Wmin""] < data[""team1LAverage""]).astype(float)) > ((data[""team2Wmin""] >= np.cos(np.minimum( (data[""team2Wmax""]), (data[""team2LAverage""])))).astype(float))).astype(float))).astype(float))))))) +
np.minimum( (np.cos(data[""team1losses""])), (((1.197370 < (data[""team2""] * ((data[""team1Lmax""] + np.round(((data[""team1Wstd""] - ((((data[""team1Lmax""] / 2.0) > data[""team2Wmax""]).astype(float)) / 2.0)) / 2.0)))/2.0))).astype(float)))) +
np.abs(((((data[""team1WAverage""] > data[""team2Wstd""]).astype(float)) * 2.0) * (np.tanh(1.732051) * ((((data[""team1wins""] <= 1.732051).astype(float)) < ((np.cos(data[""team2Lmedian""]) > np.abs(np.sin((data[""team2losses""] * 2.0)))).astype(float))).astype(float))))) +
np.minimum( (np.cos((data[""team1Wmin""] * data[""team2WAverage""]))), (np.minimum( (((-(((np.abs(data[""team1WAverage""]) > 1.414214).astype(float)))) / 2.0)), (np.cos(np.maximum( (data[""team1Wmax""]), ((data[""team1wins""] - data[""team2Wmedian""])))))))) +
np.abs(np.minimum( (np.minimum( (((np.abs(data[""team1Lmax""]) > ((1.732051 > (data[""team1Wmedian""] + data[""team1""])).astype(float))).astype(float))), (np.cos(np.minimum( (data[""team2wins""]), ((-(np.abs(data[""team1Lmax""]))))))))), (np.cos(data[""team1LAverage""])))) +
((((((data[""team1WAverage""] >= ((data[""team1Lmax""] < np.sin(1.584910)).astype(float))).astype(float)) < data[""team1Wstd""]).astype(float)) * ((2.302585 < data[""team2Wmedian""]).astype(float))) * 2.0) +
(-(((((((np.ceil(np.minimum( (data[""team2Wmax""]), (data[""team2Wstd""]))) >= (-(data[""team2Lmin""]))).astype(float)) > ((data[""team2Seed""] <= np.ceil((data[""team1Seed""] / 2.0))).astype(float))).astype(float)) + ((5.200000 <= np.floor(data[""team2Wmax""])).astype(float)))/2.0))) +
np.minimum( (((data[""year""] > data[""team1Lmedian""]).astype(float))), (np.minimum( (((data[""team2Wmedian""] < np.cos(data[""team1""])).astype(float))), ((np.maximum( (data[""team1Lmin""]), ((np.round(data[""team2Lmedian""]) + np.round(np.round(data[""team2Lmedian""]))))) / 2.0))))) +
((np.minimum( (np.minimum( (((data[""team1Lmin""] <= np.cos(data[""team2Wmin""])).astype(float))), (data[""team2losses""]))), (data[""team1Seed""])) >= ((((data[""team1LAverage""] < np.cos(((data[""team2Wmin""] < data[""team1LAverage""]).astype(float)))).astype(float)) != ((data[""team1Lmin""] <= np.cos(2.675680)).astype(float))).astype(float))).astype(float)) +
(np.minimum( (np.cos(data[""team2Seed""])), (np.floor(np.cos((data[""team1losses""] * ((1.0/(1.0 + np.exp(- (-(((((data[""team1""] <= np.cos(data[""team2Seed""])).astype(float)) >= np.maximum( (data[""team1Wstd""]), (data[""team1Wmedian""]))).astype(float))))))) * 2.0)))))) * 2.0) +
(3.141593 * (3.141593 * ((np.tanh(data[""team1Wmin""]) >= (((data[""team1losses""] < ((3.141593 + ((np.round(data[""team2Wmin""]) <= (data[""team1losses""] * data[""team2Wmax""])).astype(float)))/2.0)).astype(float)) * 2.0)).astype(float)))) +
np.tanh((data[""team1Lmax""] * (-(((data[""team2Wmax""] > (1.197370 - (((((data[""team2LAverage""] > 1.197370).astype(float)) / 2.0) == ((((((data[""team2Wmax""] != data[""team1Wstd""]).astype(float)) > data[""team1""]).astype(float)) > data[""team1Lmin""]).astype(float))).astype(float)))).astype(float)))))) +
((np.minimum( (data[""team1wins""]), (np.minimum( (data[""team1wins""]), (data[""team1Wmax""])))) > np.abs((((((data[""team1Wmax""] + data[""team1Wmax""])/2.0) * (data[""team2Lmedian""] * data[""team1LAverage""])) < np.cos(np.minimum( (data[""team2Seed""]), (data[""team1Lmedian""])))).astype(float)))).astype(float)) +
(-(np.maximum( (((data[""team1WAverage""] > (np.abs(data[""team1Lmin""]) + 2.212120)).astype(float))), (np.minimum( ((((1.0/(1.0 + math.exp(- 0.693147))) <= (-(data[""team1Lstd""]))).astype(float))), ((data[""team2Lmin""] * 2.212120))))))) +
(np.minimum( (0.585714), (np.maximum( (data[""team2Wmax""]), (np.ceil(data[""team1WAverage""]))))) * ((np.cos(data[""team2Lmin""]) < ((((2.0 > data[""team2wins""]).astype(float)) <= (data[""team1Lmin""] * ((data[""team2Lmin""] > data[""team2wins""]).astype(float)))).astype(float))).astype(float))) +
np.floor(np.cos((data[""team1WAverage""] * np.maximum( (data[""team2Lmax""]), (np.maximum( (((data[""team1Wstd""] + ((data[""team1Lstd""] + data[""team2Lstd""])/2.0))/2.0)), (np.sin(((((data[""team2Lstd""] <= data[""team1Lstd""]).astype(float)) + -2.0)/2.0))))))))) +
(((((np.round(data[""team2Lmax""]) >= ((((1.584910 <= data[""team1wins""]).astype(float)) >= ((((data[""team2Lmax""] <= 2.718282).astype(float)) >= data[""team1Lstd""]).astype(float))).astype(float))).astype(float)) < np.minimum( (data[""team2Lmax""]), ((1.630430 + data[""team1""])))).astype(float)) / 2.0) +
np.sin(np.minimum( ((data[""team1Wmedian""] * (3.141593 * np.sinh(np.maximum( (data[""team1""]), (data[""team2Lstd""])))))), ((-3.0 * ((((data[""team1""] >= data[""team1LAverage""]).astype(float)) >= ((data[""team1Wstd""] > data[""team1WAverage""]).astype(float))).astype(float)))))) +
((0.094340 >= np.abs(np.cos((data[""team1Wmax""] - (((((data[""team1Lmedian""] >= ((data[""team1""] >= (-(np.ceil(data[""team2Lmax""])))).astype(float))).astype(float)) != np.ceil(np.ceil(data[""team2Wmin""]))).astype(float)) * data[""team1Lmedian""]))))).astype(float)) +
((np.abs(data[""team1Wmax""]) <= (data[""team1Seed""] - np.maximum( (np.abs(data[""year""])), (((((np.abs(data[""year""]) > np.maximum( ((data[""team1Wmin""] * 2.0)), (data[""team2wins""]))).astype(float)) < np.ceil(np.abs(data[""team2Wmin""]))).astype(float)))))).astype(float)) +
(((-2.0 < data[""team2""]).astype(float)) * np.abs((((data[""team2""] < data[""team1Seed""]).astype(float)) * (((data[""team2losses""] <= (-2.0 / 2.0)).astype(float)) + ((data[""team2Lstd""] * ((1.570796 < data[""year""]).astype(float))) * 2.0))))) +
np.minimum( ((((data[""team2Lstd""] * ((data[""team1Wmax""] > 1.0).astype(float))) + (-((((np.maximum( (data[""team1Lmin""]), (data[""team1Wmedian""])) <= data[""team2Lstd""]).astype(float)) / 2.0))))/2.0)), (((data[""team1LAverage""] < ((data[""team1LAverage""] >= 0.602941).astype(float))).astype(float)))) +
(-(((((data[""team1Lmin""] <= 0.840000).astype(float)) <= (-((data[""team2Lmedian""] * (data[""team2Wmax""] + (data[""team2Lmin""] + ((data[""team2Wstd""] < (data[""team2Lmedian""] * np.sinh(data[""team2Lmin""]))).astype(float)))))))).astype(float)))) +
((np.minimum( (data[""team2wins""]), (data[""team1Wstd""])) > ((1.197370 >= (np.minimum( ((data[""team1Wmin""] * data[""team1Wstd""])), (data[""team1Wstd""])) - np.minimum( (np.minimum( (data[""team2Lmax""]), (data[""team1Lmin""]))), (data[""team2Wmax""])))).astype(float))).astype(float)) +
(np.cos((-(data[""team2Wmin""]))) * (((data[""team2losses""] > data[""team1Lmedian""]).astype(float)) * np.sin((data[""team2Lstd""] * np.minimum( (np.sinh(data[""team1""])), ((8.0 - data[""team2losses""]))))))) +
(((-(data[""team1WAverage""])) >= (3.0 * np.maximum( (np.maximum( (data[""team1Lmax""]), (data[""team2Lstd""]))), (((data[""team2""] < ((1.0/(1.0 + np.exp(- np.tanh((1.0/(1.0 + np.exp(- ((data[""team2Wstd""] > data[""team1LAverage""]).astype(float))))))))) * 2.0)).astype(float)))))).astype(float)) +
np.sinh(np.sinh(((np.maximum( (data[""team2losses""]), ((data[""team2Wstd""] * data[""year""]))) > ((3.0 - np.cos(((((data[""team2Wstd""] > 2.409090).astype(float)) <= data[""team1Wmin""]).astype(float)))) - ((data[""team2Wstd""] > 2.409090).astype(float)))).astype(float)))) +
np.sinh((-(((((((-(((((data[""team1Seed""] < data[""team2Lstd""]).astype(float)) < data[""team2Wmin""]).astype(float)))) >= np.cos((-(data[""team2""])))).astype(float)) >= ((data[""team1Seed""] < (2.0 - data[""team1losses""])).astype(float))).astype(float)) / 2.0)))) +
(np.minimum( (((data[""team2WAverage""] > data[""team1Wmax""]).astype(float))), (((-(((data[""team1""] - (-(((data[""team1wins""] >= ((0.094340 > data[""team1Wmax""]).astype(float))).astype(float))))) - ((0.094340 > data[""team1Wmax""]).astype(float))))) / 2.0))) / 2.0) +
(((data[""team1""] >= (1.0/(1.0 + np.exp(- np.tanh(data[""team1Wstd""]))))).astype(float)) * np.tanh(((data[""team1Lmin""] > ((1.0/(1.0 + np.exp(- np.tanh(data[""team1Lmin""])))) * np.minimum( ((((1.0/(1.0 + np.exp(- data[""team1""]))) + data[""team1Wstd""])/2.0)), (data[""team2""])))).astype(float)))) +
(((((data[""team2Wmax""] * data[""team2Wmin""]) >= ((np.floor(data[""team2Wmin""]) == ((data[""team1WAverage""] <= data[""team1WAverage""]).astype(float))).astype(float))).astype(float)) + (-(np.round(np.sin((1.0/(1.0 + np.exp(- np.abs((data[""team1Wmin""] - np.cos(data[""team2Wmin""])))))))))))/2.0))

return Outputs(predictions)


def Aggregate(teamcompactresults1,
teamcompactresults2,
merged_results,
regularseasoncompactresults):
winningteam1compactresults = pd.merge(how=\'left\',
left=teamcompactresults1,
right=regularseasoncompactresults,
left_on=[\'year\', \'team1\'],
right_on=[\'Season\', \'Wteam\'])
winningteam1compactresults.drop([\'Season\',
\'Daynum\',
\'Wteam\',
\'Lteam\',
\'Lscore\',
\'Wloc\',
\'Numot\'],
inplace=True,
axis=1)
grpwinningteam1resultsaverage = \\
winningteam1compactresults.groupby([\'year\', \'team1\']).mean()
winningteam1resultsaverage = grpwinningteam1resultsaverage.reset_index()
winningteam1resultsaverage.rename(columns={\'Wscore\': \'team1WAverage\'},
inplace=True)
grpwinningteam1resultsmin = \\
winningteam1compactresults.groupby([\'year\', \'team1\']).min()
winningteam1resultsmin = grpwinningteam1resultsmin.reset_index()
winningteam1resultsmin.rename(columns={\'Wscore\': \'team1Wmin\'},
inplace=True)
grpwinningteam1resultsmax = \\
winningteam1compactresults.groupby([\'year\', \'team1\']).max()
winningteam1resultsmax = grpwinningteam1resultsmax.reset_index()
winningteam1resultsmax.rename(columns={\'Wscore\': \'team1Wmax\'},
inplace=True)
grpwinningteam1resultsmedian = \\
winningteam1compactresults.groupby([\'year\', \'team1\']).median()
winningteam1resultsmedian = grpwinningteam1resultsmedian.reset_index()
winningteam1resultsmedian.rename(columns={\'Wscore\': \'team1Wmedian\'},
inplace=True)
grpwinningteam1resultsstd = \\
winningteam1compactresults.groupby([\'year\', \'team1\']).std()
winningteam1resultsstd = grpwinningteam1resultsstd.reset_index()
winningteam1resultsstd.rename(columns={\'Wscore\': \'team1Wstd\'},
inplace=True)
losingteam1compactresults = pd.merge(how=\'left\',
left=teamcompactresults1,
right=regularseasoncompactresults,
left_on=[\'year\', \'team1\'],
right_on=[\'Season\', \'Lteam\'])
losingteam1compactresults.drop([\'Season\',
\'Daynum\',
\'Wteam\',
\'Lteam\',
\'Wscore\',
\'Wloc\',
\'Numot\'],
inplace=True,
axis=1)
grplosingteam1resultsaverage = \\
losingteam1compactresults.groupby([\'year\', \'team1\']).mean()
losingteam1resultsaverage = grplosingteam1resultsaverage.reset_index()
losingteam1resultsaverage.rename(columns={\'Lscore\': \'team1LAverage\'},
inplace=True)
grplosingteam1resultsmin = \\
losingteam1compactresults.groupby([\'year\', \'team1\']).min()
losingteam1resultsmin = grplosingteam1resultsmin.reset_index()
losingteam1resultsmin.rename(columns={\'Lscore\': \'team1Lmin\'},
inplace=True)
grplosingteam1resultsmax = \\
losingteam1compactresults.groupby([\'year\', \'team1\']).max()
losingteam1resultsmax = grplosingteam1resultsmax.reset_index()
losingteam1resultsmax.rename(columns={\'Lscore\': \'team1Lmax\'},
inplace=True)
grplosingteam1resultsmedian = \\
losingteam1compactresults.groupby([\'year\', \'team1\']).median()
losingteam1resultsmedian = grplosingteam1resultsmedian.reset_index()
losingteam1resultsmedian.rename(columns={\'Lscore\': \'team1Lmedian\'},
inplace=True)
grplosingteam1resultsstd = \\
losingteam1compactresults.groupby([\'year\', \'team1\']).std()
losingteam1resultsstd = grplosingteam1resultsstd.reset_index()
losingteam1resultsstd.rename(columns={\'Lscore\': \'team1Lstd\'},
inplace=True)
winningteam2compactresults = pd.merge(how=\'left\',
left=teamcompactresults2,
right=regularseasoncompactresults,
left_on=[\'year\', \'team2\'],
right_on=[\'Season\', \'Wteam\'])
winningteam2compactresults.drop([\'Season\',
\'Daynum\',
\'Wteam\',
\'Lteam\',
\'Lscore\',
\'Wloc\',
\'Numot\'],
inplace=True,
axis=1)
grpwinningteam2resultsaverage = \\
winningteam2compactresults.groupby([\'year\', \'team2\']).mean()
winningteam2resultsaverage = grpwinningteam2resultsaverage.reset_index()
winningteam2resultsaverage.rename(columns={\'Wscore\': \'team2WAverage\'},
inplace=True)
grpwinningteam2resultsmin = \\
winningteam2compactresults.groupby([\'year\', \'team2\']).min()
winningteam2resultsmin = grpwinningteam2resultsmin.reset_index()
winningteam2resultsmin.rename(columns={\'Wscore\': \'team2Wmin\'},
inplace=True)
grpwinningteam2resultsmax = \\
winningteam2compactresults.groupby([\'year\', \'team2\']).max()
winningteam2resultsmax = grpwinningteam2resultsmax.reset_index()
winningteam2resultsmax.rename(columns={\'Wscore\': \'team2Wmax\'},
inplace=True)
grpwinningteam2resultsmedian = \\
winningteam2compactresults.groupby([\'year\', \'team2\']).median()
winningteam2resultsmedian = grpwinningteam2resultsmedian.reset_index()
winningteam2resultsmedian.rename(columns={\'Wscore\': \'team2Wmedian\'},
inplace=True)
grpwinningteam2resultsstd = \\
winningteam2compactresults.groupby([\'year\', \'team2\']).std()
winningteam2resultsstd = grpwinningteam2resultsstd.reset_index()
winningteam2resultsstd.rename(columns={\'Wscore\': \'team2Wstd\'},
inplace=True)
losingteam2compactresults = pd.merge(how=\'left\',
left=teamcompactresults2,
right=regularseasoncompactresults,
left_on=[\'year\', \'team2\'],
right_on=[\'Season\', \'Lteam\'])
losingteam2compactresults.drop([\'Season\',
\'Daynum\',
\'Wteam\',
\'Lteam\',
\'Wscore\',
\'Wloc\',
\'Numot\'],
inplace=True,
axis=1)
grplosingteam2resultsaverage = \\
losingteam2compactresults.groupby([\'year\', \'team2\']).mean()
losingteam2resultsaverage = grplosingteam2resultsaverage.reset_index()
losingteam2resultsaverage.rename(columns={\'Lscore\': \'team2LAverage\'},
inplace=True)
grplosingteam2resultsmin = \\
losingteam2compactresults.groupby([\'year\', \'team2\']).min()
losingteam2resultsmin = grplosingteam2resultsmin.reset_index()
losingteam2resultsmin.rename(columns={\'Lscore\': \'team2Lmin\'},
inplace=True)
grplosingteam2resultsmax = \\
losingteam2compactresults.groupby([\'year\', \'team2\']).max()
losingteam2resultsmax = grplosingteam2resultsmax.reset_index()
losingteam2resultsmax.rename(columns={\'Lscore\': \'team2Lmax\'},
inplace=True)
grplosingteam2resultsmedian = \\
losingteam2compactresults.groupby([\'year\', \'team2\']).median()
losingteam2resultsmedian = grplosingteam2resultsmedian.reset_index()
losingteam2resultsmedian.rename(columns={\'Lscore\': \'team2Lmedian\'},
inplace=True)
grplosingteam2resultsstd = \\
losingteam2compactresults.groupby([\'year\', \'team2\']).std()
losingteam2resultsstd = grplosingteam2resultsstd.reset_index()
losingteam2resultsstd.rename(columns={\'Lscore\': \'team2Lstd\'},
inplace=True)
agg_results = pd.merge(how=\'left\',
left=merged_results,
right=winningteam1resultsaverage,
left_on=[\'year\', \'team1\'],
right_on=[\'year\', \'team1\'])
agg_results = pd.merge(how=\'left\',
left=agg_results,
right=losingteam1resultsaverage,
left_on=[\'year\', \'team1\'],
right_on=[\'year\', \'team1\'])
agg_results = pd.merge(how=\'left\',
left=agg_results,
right=winningteam1resultsmin,
left_on=[\'year\', \'team1\'],
right_on=[\'year\', \'team1\'])
agg_results = pd.merge(how=\'left\',
left=agg_results,
right=losingteam1resultsmin,
left_on=[\'year\', \'team1\'],
right_on=[\'year\', \'team1\'])
agg_results = pd.merge(how=\'left\',
left=agg_results,
right=winningteam1resultsmax,
left_on=[\'year\', \'team1\'],
right_on=[\'year\', \'team1\'])
agg_results = pd.merge(how=\'left\',
left=agg_results,
right=losingteam1resultsmax,
left_on=[\'year\', \'team1\'],
right_on=[\'year\', \'team1\'])
agg_results = pd.merge(how=\'left\',
left=agg_results,
right=winningteam1resultsmedian,
left_on=[\'year\', \'team1\'],
right_on=[\'year\', \'team1\'])
agg_results = pd.merge(how=\'left\',
left=agg_results,
right=losingteam1resultsmedian,
left_on=[\'year\', \'team1\'],
right_on=[\'year\', \'team1\'])
agg_results = pd.merge(how=\'left\',
left=agg_results,
right=winningteam1resultsstd,
left_on=[\'year\', \'team1\'],
right_on=[\'year\', \'team1\'])
agg_results = pd.merge(how=\'left\',
left=agg_results,
right=losingteam1resultsstd,
left_on=[\'year\', \'team1\'],
right_on=[\'year\', \'team1\'])
agg_results = pd.merge(how=\'left\',
left=agg_results,
right=winningteam2resultsaverage,
left_on=[\'year\', \'team2\'],
right_on=[\'year\', \'team2\'])
agg_results = pd.merge(how=\'left\',
left=agg_results,
right=losingteam2resultsaverage,
left_on=[\'year\', \'team2\'],
right_on=[\'year\', \'team2\'])
agg_results = pd.merge(how=\'left\',
left=agg_results,
right=winningteam2resultsmin,
left_on=[\'year\', \'team2\'],
right_on=[\'year\', \'team2\'])
agg_results = pd.merge(how=\'left\',
left=agg_results,
right=losingteam2resultsmin,
left_on=[\'year\', \'team2\'],
right_on=[\'year\', \'team2\'])
agg_results = pd.merge(how=\'left\',
left=agg_results,
right=winningteam2resultsmax,
left_on=[\'year\', \'team2\'],
right_on=[\'year\', \'team2\'])
agg_results = pd.merge(how=\'left\',
left=agg_results,
right=losingteam2resultsmax,
left_on=[\'year\', \'team2\'],
right_on=[\'year\', \'team2\'])
agg_results = pd.merge(how=\'left\',
left=agg_results,
right=winningteam2resultsmedian,
left_on=[\'year\', \'team2\'],
right_on=[\'year\', \'team2\'])
agg_results = pd.merge(how=\'left\',
left=agg_results,
right=losingteam2resultsmedian,
left_on=[\'year\', \'team2\'],
right_on=[\'year\', \'team2\'])
agg_results = pd.merge(how=\'left\',
left=agg_results,
right=winningteam2resultsstd,
left_on=[\'year\', \'team2\'],
right_on=[\'year\', \'team2\'])
agg_results = pd.merge(how=\'left\',
left=agg_results,
right=losingteam2resultsstd,
left_on=[\'year\', \'team2\'],
right_on=[\'year\', \'team2\'])
return agg_results


def GrabData():
tourneyresults = pd.read_csv(\'../input/TourneyCompactResults.csv\')
tourneyseeds = pd.read_csv(\'../input/TourneySeeds.csv\')
regularseasoncompactresults = \\
pd.read_csv(\'../input/RegularSeasonCompactResults.csv\')
sample = pd.read_csv(\'../input/SampleSubmission.csv\')
results = pd.DataFrame()
results[\'year\'] = tourneyresults.Season
results[\'team1\'] = np.minimum(tourneyresults.Wteam, tourneyresults.Lteam)
results[\'team2\'] = np.maximum(tourneyresults.Wteam, tourneyresults.Lteam)
results[\'result\'] = (tourneyresults.Wteam <
tourneyresults.Lteam).astype(int)
merged_results = pd.merge(left=results,
right=tourneyseeds,
left_on=[\'year\', \'team1\'],
right_on=[\'Season\', \'Team\'])
merged_results.drop([\'Season\', \'Team\'], inplace=True, axis=1)
merged_results.rename(columns={\'Seed\': \'team1Seed\'}, inplace=True)
merged_results = pd.merge(left=merged_results,
right=tourneyseeds,
left_on=[\'year\', \'team2\'],
right_on=[\'Season\', \'Team\'])
merged_results.drop([\'Season\', \'Team\'], inplace=True, axis=1)
merged_results.rename(columns={\'Seed\': \'team2Seed\'}, inplace=True)
merged_results[\'team1Seed\'] = \\
merged_results[\'team1Seed\'].apply(lambda x: str(x)[1:3])
merged_results[\'team2Seed\'] = \\
merged_results[\'team2Seed\'].apply(lambda x: str(x)[1:3])
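    # note: tournament seeds look like 'W01' or 'Y16a', so [1:3] keeps the two numeric digits
    # and drops the region letter (and any play-in suffix).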
merged_results = merged_results.astype(int)
winsbyyear = regularseasoncompactresults[[\'Season\', \'Wteam\']].copy()
winsbyyear[\'wins\'] = 1
wins = winsbyyear.groupby([\'Season\', \'Wteam\']).sum()
wins = wins.reset_index()
lossesbyyear = regularseasoncompactresults[[\'Season\', \'Lteam\']].copy()
lossesbyyear[\'losses\'] = 1
losses = lossesbyyear.groupby([\'Season\', \'Lteam\']).sum()
losses = losses.reset_index()
winsteam1 = wins.copy()
winsteam1.rename(columns={\'Season\': \'year\',
\'Wteam\': \'team1\',
\'wins\': \'team1wins\'}, inplace=True)
winsteam2 = wins.copy()
winsteam2.rename(columns={\'Season\': \'year\',
\'Wteam\': \'team2\',
\'wins\': \'team2wins\'}, inplace=True)
lossesteam1 = losses.copy()
lossesteam1.rename(columns={\'Season\': \'year\',
\'Lteam\': \'team1\',
\'losses\': \'team1losses\'}, inplace=True)
lossesteam2 = losses.copy()
lossesteam2.rename(columns={\'Season\': \'year\',
\'Lteam\': \'team2\',
\'losses\': \'team2losses\'}, inplace=True)
merged_results = pd.merge(how=\'left\',
left=merged_results,
right=winsteam1,
left_on=[\'year\', \'team1\'],
right_on=[\'year\', \'team1\'])
merged_results = pd.merge(how=\'left\',
left=merged_results,
right=lossesteam1,
left_on=[\'year\', \'team1\'],
right_on=[\'year\', \'team1\'])
merged_results = pd.merge(how=\'left\',
left=merged_results,
right=winsteam2,
left_on=[\'year\', \'team2\'],
right_on=[\'year\', \'team2\'])
merged_results = pd.merge(how=\'left\',
left=merged_results,
right=lossesteam2,
left_on=[\'year\', \'team2\'],
right_on=[\'year\', \'team2\'])
teamcompactresults1 = merged_results[[\'year\', \'team1\']].copy()
teamcompactresults2 = merged_results[[\'year\', \'team2\']].copy()

train = Aggregate(teamcompactresults1,
teamcompactresults2,
merged_results,
regularseasoncompactresults)

sample[\'year\'] = sample.Id.apply(lambda x: str(x)[:4]).astype(int)
sample[\'team1\'] = sample.Id.apply(lambda x: str(x)[5:9]).astype(int)
sample[\'team2\'] = sample.Id.apply(lambda x: str(x)[10:14]).astype(int)

merged_results = pd.merge(how=\'left\',
left=sample,
right=tourneyseeds,
left_on=[\'year\', \'team1\'],
right_on=[\'Season\', \'Team\'])
merged_results.drop([\'Season\', \'Team\'], inplace=True, axis=1)
merged_results.rename(columns={\'Seed\': \'team1Seed\'}, inplace=True)
merged_results = pd.merge(how=\'left\',
left=merged_results,
right=tourneyseeds,
left_on=[\'year\', \'team2\'],
right_on=[\'Season\', \'Team\'])
merged_results.drop([\'Season\', \'Team\'], inplace=True, axis=1)
merged_results.rename(columns={\'Seed\': \'team2Seed\'}, inplace=True)
merged_results[\'team1Seed\'] = \\
merged_results[\'team1Seed\'].apply(lambda x: str(x)[1:3]).astype(int)
merged_results[\'team2Seed\'] = \\
merged_results[\'team2Seed\'].apply(lambda x: str(x)[1:3]).astype(int)
merged_results = pd.merge(how=\'left\',
left=merged_results,
right=winsteam1,
left_on=[\'year\', \'team1\'],
right_on=[\'year\', \'team1\'])
merged_results = pd.merge(how=\'left\',
left=merged_results,
right=lossesteam1,
left_on=[\'year\', \'team1\'],
right_on=[\'year\', \'team1\'])
merged_results = pd.merge(how=\'left\',
left=merged_results,
right=winsteam2,
left_on=[\'year\', \'team2\'],
right_on=[\'year\', \'team2\'])
merged_results = pd.merge(how=\'left\',
left=merged_results,
right=lossesteam2,
left_on=[\'year\', \'team2\'],
right_on=[\'year\', \'team2\'])

teamcompactresults1 = merged_results[[\'year\', \'team1\']].copy()
teamcompactresults2 = merged_results[[\'year\', \'team2\']].copy()

test = Aggregate(teamcompactresults1,
teamcompactresults2,
merged_results,
regularseasoncompactresults)

return train, test
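    # note: GrabData returns one row per tournament game (train) and per sample-submission
    # pair (test), carrying seeds, regular-season win/loss counts, and per-team summary
    # statistics (mean/min/max/median/std of winning and losing scores) joined on (year, team).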


if __name__ == ""__main__"":

train = pd.read_csv(\'../input/TourneyCompactResults.csv\')

elo = Elo(125)

team = {}

for index, row in train.iterrows():
t1 = row[\'Wteam\']
t2 = row[\'Lteam\']
if not t1 in team: team[t1] = 1000.0
if not t2 in team: team[t2] = 1000.0

(team[t1], team[t2]) = elo.rate_1vs1(team[t1], team[t2])
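        # note: rate_1vs1 treats its first argument as the winner, so passing the Wteam rating
        # first credits the win; ratings here start at 1000.0 rather than the module default INITIAL=1300.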
#print(team)

elo = Elo(140)

team2 = {}

for index, row in train.iterrows():
t1 = row[\'Wteam\']
t2 = row[\'Lteam\']
if not t1 in team2: team2[t1] = 1020.0
if not t2 in team2: team2[t2] = 1020.0

(team2[t1], team2[t2]) = elo.rate_1vs1(team2[t1], team2[t2])



train, test = GrabData()
trainlabels = train.result.values
train.drop(\'result\', inplace=True, axis=1)
train.fillna(-1, inplace=True)
testids = test.Id.values
test.drop([\'Id\', \'Pred\'], inplace=True, axis=1)
test.fillna(-1, inplace=True)
ss = StandardScaler()
train[train.columns] = np.round(ss.fit_transform(train), 6)
predictions = GPIndividual1(train)
predictions.fillna(1, inplace=True)
print(log_loss(trainlabels, np.clip(predictions.values, .01, .99)))
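    # note: this log loss is computed on the same games the formula was presumably derived
    # from, so it is an in-sample figure and will be optimistic compared with the leaderboard.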
test[test.columns] = np.round(ss.transform(test), 6)
predictions = GPIndividual1(test)
predictions.fillna(1, inplace=True)


preds = pd.read_csv(\'../input/SampleSubmission.csv\')
prediction = np.zeros((preds.shape[0], 1))
i = 0
for index, row in preds.iterrows():
p = list(map(int, str.split(str(row[\'Id\']), \'_\')))
#prediction[i] = 0.5 + 0.3*(team[p[1]] - team[p[2]]) / 480 + 0.7*(team2[p[1]] - team2[p[2]])/520
prediction[i] = predictions.values[i]
i += 1

preds[\'Pred\'] = np.clip(prediction, 0.07, 0.93)
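    # note: clipping to [0.07, 0.93] bounds the log-loss penalty; a confidently wrong
    # prediction at exactly 0 or 1 would otherwise be penalised without bound.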
preds.to_csv(\'Prediction.csv\', index=False)",No,5,53.0 "# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here\'s several helpful packages to load in

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.model_selection import train_test_split  # sklearn.cross_validation was replaced by model_selection
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Input data files are available in the ""../input/"" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
print(check_output([""ls"", ""../input""]).decode(""utf8""))

# Any results you write to the current directory are saved as output.",No,5,88.0 "train = pd.read_csv(""../input/train.csv"") test = pd.read_csv(""../input/test.csv"")",No,5,45.0 "def loadData(df, test = None): dt = pd.to_datetime(df.datetime).dt df[""Year""] = dt.year df[""Month""] = dt.month df[""Day""] = dt.day df[""Hour""] = dt.hour df.drop(""datetime"", axis = 1, inplace = True) if not test: df.drop(""casual"", axis = 1, inplace = True) df.drop(""registered"", axis = 1, inplace = True) if test: y = None else: y = df[""count""] df.drop(""count"", axis = 1, inplace = True) X = df return X, y ",No,4,21.0 "X, y = loadData(train)",No,5,21.0 new_y = np.log(y + 1),No,4,21.0 "# use a full grid over all parameters
\'\'\'
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from time import time
param_grid = {""max_depth"": [3, None],
""max_features"": [1, 3, 10],
""min_samples_split"": [1, 3, 10],
""min_samples_leaf"": [1, 3, 10],
""bootstrap"": [True, False]}

clf = RandomForestRegressor(n_estimators=20)
# run grid search
grid_search = GridSearchCV(clf, param_grid=param_grid)
start = time()
grid_search.fit(X, y)

print(""GridSearchCV took %.2f seconds for %d candidate parameter settings.""
% (time() - start, len(grid_search.grid_scores_)))
#report(grid_search.grid_scores_)
\'\'\'",No,5,6.0 "# RF X_test, _ = loadData(test, test = True) rf = RandomForestRegressor().fit(X, new_y) prediction = rf.predict(X_test) ",No,4,48.0 "### xgb #import xgboost as xgb #gbm = xgb.XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.05).fit(train_X, train_y) #predictions = gbm.predict(test_X)",No,4,48.0 "### Get submission sample = pd.read_csv(""../input/sampleSubmission.csv"") submission = pd.DataFrame() submission[""datetime""] = sample[""datetime""] submission[""count""] = pd.Series(prediction) submission.to_csv(""sub.csv"", index = False)",No,3,55.0 "print(check_output([""head"", ""../input/sampleSubmission.csv""]).decode(""utf8""))",No,5,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here\'s several helpful packages to load in

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the ""../input/"" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
print(check_output([""ls"", ""../input""]).decode(""utf8""))

# Any results you write to the current directory are saved as output.",No,5,88.0 "import random def random_sampler(filename, k): sample = [] with open(filename, 'rb') as f: f.seek(0, 2) filesize = f.tell() random_set = sorted(random.sample(range(filesize), k)) for i in range(k): f.seek(random_set[i]) # Skip current line (because we might be in the middle of a line) f.readline() # Append the next line to the sample set sample.append(f.readline().rstrip()) return sample",No,5,15.0 "TRAIN_SAMPLES = 5*10**6 train_sample = random_sampler('../input/train.csv', TRAIN_SAMPLES)",No,5,15.0 "train_sample[0].decode().split(',')",No,4,78.0 "train_sample_ = [row.decode().split("","") for row in train_sample]",No,5,78.0 train = pd.DataFrame(train_sample_),No,5,12.0 "train_df = pd.read_csv('../input/train.csv', nrows=1) train.columns = train_df.columns",No,3,45.0 train = train.apply(pd.to_numeric),No,5,16.0 train.info(),No,5,40.0 train['Demanda_uni_equil'] = np.log1p(train['Demanda_uni_equil']),No,4,21.0 "x_cols = train.columns x_cols = x_cols.drop(['Venta_uni_hoy', 'Venta_hoy', 'Dev_uni_proxima', 'Dev_proxima', 'Demanda_uni_equil']) print(x_cols)",No,5,10.0 "from sklearn.ensemble import RandomForestRegressor model = RandomForestRegressor()",No,5,4.0 "model.fit(train[x_cols], train['Demanda_uni_equil'])",No,5,7.0 "test = pd.read_csv('../input/test.csv') test.info()",No,4,45.0 test['Demanda_uni_equil'] = np.expm1(model.predict(test[x_cols])),No,5,48.0 "test[['id', 'Demanda_uni_equil']].to_csv('predictions_rf_random_sampling.csv', index=False)",No,5,25.0 "import pandas as pd
import numpy as np
from scipy import sparse as ssp
import pylab as plt
from sklearn.preprocessing import LabelEncoder,LabelBinarizer,MinMaxScaler,OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.decomposition import TruncatedSVD,NMF,PCA,FactorAnalysis
from sklearn.feature_selection import SelectFromModel,SelectPercentile,f_classif
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import log_loss,roc_auc_score
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.cross_validation import StratifiedKFold,KFold
from keras.preprocessing import sequence
from keras.callbacks import ModelCheckpoint,Callback
from keras import backend as K
from keras.layers import Input, Embedding, LSTM, Dense,Flatten, Dropout, merge,Convolution1D,MaxPooling1D,Lambda,AveragePooling1D,Reshape
from keras.layers.normalization import BatchNormalization
from keras.optimizers import SGD
from keras.layers.advanced_activations import PReLU,LeakyReLU,ELU,SReLU
from keras.models import Model

seed = 1
np.random.seed(seed)
dim = 32
hidden=64

path = ""../input/""

class AucCallback(Callback):  # inherits from Callback

    def __init__(self, validation_data=(), patience=25, is_regression=True, best_model_name='best_keras.mdl', feval='roc_auc_score', batch_size=1024*8):
        super(Callback, self).__init__()

        self.patience = patience
        self.X_val, self.y_val = validation_data  # tuple of validation X and y
        self.best = -np.inf
        self.wait = 0  # counter for patience
        self.best_model = None
        self.best_model_name = best_model_name
        self.is_regression = is_regression
        self.y_val = self.y_val  # .astype(np.int)
        self.feval = feval
        self.batch_size = batch_size

    def on_epoch_end(self, epoch, logs={}):
        p = self.model.predict(self.X_val, batch_size=self.batch_size, verbose=0)  # .ravel()
        if self.feval == 'roc_auc_score':
            current = roc_auc_score(self.y_val, p)

        if current > self.best:
            self.best = current
            self.wait = 0
            self.model.save_weights(self.best_model_name, overwrite=True)
        else:
            if self.wait >= self.patience:
                self.model.stop_training = True
                print('Epoch %05d: early stopping' % (epoch))
            self.wait += 1  # increment the number of epochs without improvement

        print('Epoch %d Auc: %f | Best Auc: %f \n' % (epoch, current, self.best))


def make_batches(size, batch_size):
    nb_batch = int(np.ceil(size/float(batch_size)))
    return [(i*batch_size, min(size, (i+1)*batch_size)) for i in range(0, nb_batch)]
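# Added sanity check (not in the original script): make_batches splits `size`
# rows into consecutive (start, end) index pairs, e.g. for 10 rows and a batch
# size of 4 it yields three slices covering every row exactly once.
assert make_batches(10, 4) == [(0, 4), (4, 8), (8, 10)]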



def main():
train = pd.read_csv(path+\'act_train.csv\')
test = pd.read_csv(path+\'act_test.csv\')
people = pd.read_csv(path+\'people.csv\')
columns = people.columns
test[\'outcome\'] = np.nan
data = pd.concat([train,test])

data = pd.merge(data,people,how=\'left\',on=\'people_id\').fillna(\'missing\')
train = data[:train.shape[0]]
test = data[train.shape[0]:]



columns = train.columns.tolist()
columns.remove(\'activity_id\')
columns.remove(\'outcome\')
data = pd.concat([train,test])
for c in columns:
data[c] = LabelEncoder().fit_transform(data[c].values)

train = data[:train.shape[0]]
test = data[train.shape[0]:]

data = pd.concat([train,test])
columns = train.columns.tolist()
columns.remove(\'activity_id\')
columns.remove(\'outcome\')
flatten_layers = []
inputs = []
count=0
for c in columns:

inputs_c = Input(shape=(1,), dtype=\'int32\')

num_c = len(np.unique(data[c].values))

embed_c = Embedding(
num_c,
dim,
dropout=0.2,
input_length=1
)(inputs_c)
flatten_c= Flatten()(embed_c)

inputs.append(inputs_c)
flatten_layers.append(flatten_c)
count+=1

flatten = merge(flatten_layers,mode=\'concat\')
reshaped_flatten = Reshape((count,dim))(flatten)

conv_1 = Convolution1D(nb_filter=16,
filter_length=3,
border_mode=\'same\',
activation=\'relu\',
subsample_length=1)(reshaped_flatten)
pool_1 = MaxPooling1D(pool_length=int(count/2))(conv_1)

flatten = Flatten()(pool_1)


fc1 = Dense(hidden,activation=\'relu\')(flatten)
dp1 = Dropout(0.5)(fc1)

outputs = Dense(1,activation=\'sigmoid\')(dp1)

model = Model(input=inputs, output=outputs)
model.compile(
optimizer=\'adam\',
loss=\'binary_crossentropy\',
)

del data

X = train[columns].values
X_t = test[columns].values
y = train[""outcome""].values
people_id = train[""people_id""].values
activity_id = test[\'activity_id\']
del train
del test

skf = StratifiedKFold(y, n_folds=4, shuffle=True, random_state=seed)
for ind_tr, ind_te in skf:
X_train = X[ind_tr]
X_test = X[ind_te]

y_train = y[ind_tr]
y_test = y[ind_te]
break

X_train = [X_train[:,i] for i in range(X.shape[1])]
X_test = [X_test[:,i] for i in range(X.shape[1])]

del X

model_name = \'mlp_residual_%s_%s.hdf5\'%(dim,hidden)
model_checkpoint = ModelCheckpoint(model_name, monitor=\'val_loss\', save_best_only=True)
auc_callback = AucCallback(validation_data=(X_test,y_test), patience=5,is_regression=True,best_model_name=path+\'best_keras.mdl\',feval=\'roc_auc_score\')

nb_epoch = 2

batch_size = 1024*8
load_model = False

if load_model:
print(\'Load Model\')
model.load_weights(path+model_name)
# model.load_weights(path+\'best_keras.mdl\')

model.fit(
X_train,
y_train,
batch_size=batch_size,
nb_epoch=nb_epoch,
verbose=1,
shuffle=True,
validation_data=[X_test,y_test],
# callbacks = [
# model_checkpoint,
# auc_callback,
# ],
)

# model.load_weights(model_name)
# model.load_weights(path+\'best_keras.mdl\')

y_preds = model.predict(X_test,batch_size=1024*8)
# print(\'auc\',roc_auc_score(y_test,y_preds))

# print(\'Make submission\')
X_t = [X_t[:,i] for i in range(X_t.shape[1])]
outcome = model.predict(X_t,batch_size=1024*8)
submission = pd.DataFrame()
submission[\'activity_id\'] = activity_id
submission[\'outcome\'] = outcome
submission.to_csv(\'submission_residual_%s_%s.csv\'%(dim,hidden),index=False)

main()
",No,3,45.0 "%matplotlib inline import pandas as pd import numpy as np import seaborn as sns import matplotlib.pyplot as plt import matplotlib.cm as cm import os from sklearn.preprocessing import LabelEncoder from sklearn.cross_validation import KFold from sklearn.metrics import log_loss",No,5,23.0 "gatrain = pd.read_csv('../input/gender_age_train.csv') gatest = pd.read_csv('../input/gender_age_test.csv') gatrain.head(3)",No,4,45.0 "letarget = LabelEncoder().fit(gatrain.group.values) y = letarget.transform(gatrain.group.values) n_classes = len(letarget.classes_)",No,4,20.0 "phone = pd.read_csv('../input/phone_brand_device_model.csv',encoding='utf-8') phone.head(3)",No,4,45.0 "phone = phone.drop_duplicates('device_id', keep='first')",No,5,19.0 "lebrand = LabelEncoder().fit(phone.phone_brand) phone['brand'] = lebrand.transform(phone.phone_brand) m = phone.phone_brand.str.cat(phone.device_model) lemodel = LabelEncoder().fit(m) phone['model'] = lemodel.transform(m)",No,5,20.0 "train = gatrain.merge(phone[['device_id','brand','model']], how='left',on='device_id')",No,5,32.0 "class GenderAgeGroupProb(object): def __init__(self): pass def fit(self, df, by, n_smoothing, weights): self.by = by self.n_smoothing = n_smoothing self.weights = np.divide(weights,sum(weights)) self.classes_ = sorted(df['group'].unique()) self.n_classes_ = len(self.classes_) self.group_freq = df['group'].value_counts().sort_index()/df.shape[0] self.prob_by = [] for i,b in enumerate(self.by): c = df.groupby([b,'group']).size().unstack().fillna(0) total = c.sum(axis=1) prob = (c.add(self.n_smoothing[i]*self.group_freq)).div(total+self.n_smoothing[i], axis=0) self.prob_by.append(prob) return self def predict_proba(self, df): pred = pd.DataFrame(np.zeros((len(df.index),self.n_classes_)),columns=self.classes_,index=df.index) pred_by = [] for i,b in enumerate(self.by): pred_by.append(df[[b]].merge(self.prob_by[i], how='left', left_on=b, right_index=True).fillna(self.group_freq)[self.classes_]) pred = pred.radd(pred_by[i].values*self.weights[i]) pred.loc[pred.iloc[:,0].isnull(),:] = self.group_freq return pred[self.classes_].values def score(ptrain, by, n_smoothing, weights=[0.5,0.5]): kf = KFold(ptrain.shape[0], n_folds=10, shuffle=True, random_state=0) pred = np.zeros((ptrain.shape[0],n_classes)) for itrain, itest in kf: train = ptrain.iloc[itrain,:] test = ptrain.iloc[itest,:] ytrain, ytest = y[itrain], y[itest] clf = GenderAgeGroupProb().fit(train,by,n_smoothing,weights) pred[itest,:] = clf.predict_proba(test) return log_loss(y, pred)",No,4,7.0 "n_smoothing = [1,5,10,15,20,50,100] res = [score(train,['brand','model'],[s,s],[.5,.5]) for s in n_smoothing] plt.plot(n_smoothing, res) plt.title('Best score {:.5f} at n_smoothing = {}'.format(np.min(res),n_smoothing[np.argmin(res)])) plt.xlabel('n_smoothing')",No,4,81.0 "brand_weight = [0,0.2,0.4,0.6,0.8,1.0] res = [score(train,['brand','model'],[15,15],[b,1-b]) for b in brand_weight] plt.plot(brand_weight, res) plt.title('Best score {:.5f} at brand_weight = {}'.format(np.min(res),brand_weight[np.argmin(res)])) plt.xlabel('brand_weight')",No,5,81.0 "test = gatest.merge(phone[['device_id','brand','model']], how='left',on='device_id') test.head(3)",No,5,32.0 "clf = GenderAgeGroupProb().fit(train,['brand','model'],[15,15],[0.4,0.6]) pred = clf.predict_proba(test)",No,4,7.0 "pd.DataFrame(pred, index = test.device_id, columns=clf.classes_).to_csv('pbm_subm.csv', index=True)",No,5,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here\'s several helpful packages to load in

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import xgboost as xgb
# Input data files are available in the ""../input/"" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

#from subprocess import check_output
#print(check_output([""ls"", ""../input""]).decode(""utf8""))

# Any results you write to the current directory are saved as output.",Yes,3,10.0 "def getVariables(value=1000): for var, obj in globals().items(): try: if(sys.getsizeof(obj) > value and not var.startswith(""_"")): print (""{0:30} {1:5}"".format(var, sys.getsizeof(obj))) except: continue",No,3,23.0 "def evalerror(preds, dtrain): labels = dtrain.get_label() assert len(preds) == len(labels) labels = labels.tolist() preds = preds.tolist() terms_to_sum = [(math.log(labels[i] + 1) - math.log(max(0,preds[i]) + 1)) ** 2.0 for i,pred in enumerate(labels)] return 'error', (sum(terms_to_sum) * (1.0/len(preds))) ** 0.5",No,5,84.0 "print ('Loading Test...') dtype_test = {'id':np.uint32, 'Semana': np.uint8, 'Agencia_ID': np.uint16, 'Canal_ID': np.uint8, 'Ruta_SAK': np.uint16, 'Cliente_ID': np.uint32, 'Producto_ID': np.uint16} %time test = pd.read_csv('../input/test.csv', usecols=dtype_test.keys(), dtype=dtype_test) test.head()",No,5,45.0 test.shape,No,5,58.0 "dtype = {'Semana': np.uint8, 'Agencia_ID': np.uint16, 'Canal_ID': np.uint8, 'Ruta_SAK': np.uint16, 'Cliente_ID': np.uint32, 'Producto_ID': np.uint16, 'Demanda_uni_equil': np.uint16} filename='../input/train.csv' %time train = pd.read_csv(filename, usecols=dtype.keys(), dtype=dtype, warn_bad_lines= True,engine='c') train.head()",No,4,45.0 "train = train[train[""Semana""]>8]
print (\'Training_Shape:\', train.shape)",No,4,14.0 "ids = test['id'] test = test.drop(['id'],axis = 1) y = train['Demanda_uni_equil'] X = train[test.columns.values] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1729) del(train) print ('Division_Set_Shapes:', X.shape, y.shape) print ('Validation_Set_Shapes:', X_train.shape, X_test.shape) del(X) del(y)",No,3,21.0 "params = {}
params[\'objective\'] = ""reg:linear""
params[\'eta\'] = 0.1
params[\'max_depth\'] = 5
params[\'subsample\'] = 0.8
params[\'colsample_bytree\'] = 0.6
params[\'silent\'] = True
#params[\'nthread\']= 4
params[\'booster\'] = ""gbtree""


test_preds = np.zeros(test.shape[0])
xg_train = xgb.DMatrix(X_train, label=y_train)
del(X_train)
del(y_train)
xg_test = xgb.DMatrix(X_test)
del(X_test)
watchlist = [(xg_train, \'train\')]",No,4,59.0 "num_rounds = 20 %time xgclassifier = xgb.train(params, xg_train, num_rounds, watchlist, feval = evalerror, early_stopping_rounds= 20, verbose_eval = 10) del(xg_train)",No,5,7.0 "preds = xgclassifier.predict(xg_test, ntree_limit=xgclassifier.best_iteration) print ('RMSLE Score:', rmsle(y_test, preds)) ",No,3,49.0 "print ('RMSLE Score:', rmsle(y_test, preds)) del(preds) del(y_test)",No,5,49.0 "import numpy as np import pandas as pd import datetime act_train = pd.read_csv('../input/act_train.csv') act_test = pd.read_csv('../input/act_test.csv') people = pd.read_csv('../input/people.csv') people.sample(10)",Yes,4,8.0 "def process_dates(data,min_date): #min_date=data.min() min_date data=data.apply(lambda x: (datetime.datetime.strptime(x,""%Y-%m-%d"") -datetime.datetime.strptime(min_date,""%Y-%m-%d"")).days) data return data",No,4,8.0 "import pandas as pd
import numpy as np
import datetime
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import xgboost as xgb
import random
from operator import itemgetter
import time
import copy

random.seed(2016)


def create_feature_map(features):
    outfile = open('xgb.fmap', 'w')
    for i, feat in enumerate(features):
        outfile.write('{0}\t{1}\tq\n'.format(i, feat))
    outfile.close()


def get_importance(gbm, features):
    create_feature_map(features)
    importance = gbm.get_fscore(fmap='xgb.fmap')
    importance = sorted(importance.items(), key=itemgetter(1), reverse=True)
    return importance


def intersect(a, b):
    return list(set(a) & set(b))

def get_features(train, test):
    trainval = list(train.columns.values)
    testval = list(test.columns.values)
    output = intersect(trainval, testval)
    output.remove('people_id')
    return sorted(output)
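# Illustration (added, not part of the original script): get_features keeps only
# the columns shared by train and test, so the train-only 'outcome' column can
# never leak into the feature list. For example:
#   get_features(pd.DataFrame(columns=['people_id', 'char_1', 'outcome']),
#                pd.DataFrame(columns=['people_id', 'char_1']))  ->  ['char_1']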

def run_single(train, test, features, target, random_state=0):
    eta = 1.3
    max_depth = 3
    subsample = 0.8
    colsample_bytree = 0.8
    start_time = time.time()

    print('XGBoost params. ETA: {}, MAX_DEPTH: {}, SUBSAMPLE: {}, COLSAMPLE_BY_TREE: {}'.format(eta, max_depth, subsample, colsample_bytree))
    params = {
        ""objective"": ""binary:logistic"",
        ""booster"": ""gbtree"",
        ""eval_metric"": ""auc"",
        ""eta"": eta,
        ""tree_method"": 'exact',
        ""max_depth"": max_depth,
        ""subsample"": subsample,
        ""colsample_bytree"": colsample_bytree,
        ""silent"": 1,
        ""seed"": random_state,
    }
    num_boost_round = 115
    early_stopping_rounds = 10
    test_size = 0.1

    X_train, X_valid = train_test_split(train, test_size=test_size, random_state=random_state)
    print('Length train:', len(X_train.index))
    print('Length valid:', len(X_valid.index))
    y_train = X_train[target]
    y_valid = X_valid[target]
    dtrain = xgb.DMatrix(X_train[features], y_train)
    dvalid = xgb.DMatrix(X_valid[features], y_valid)

    watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
    gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, early_stopping_rounds=early_stopping_rounds, verbose_eval=True)

    print(""Validating..."")
    check = gbm.predict(xgb.DMatrix(X_valid[features]), ntree_limit=gbm.best_iteration+1)
    score = roc_auc_score(X_valid[target].values, check)
    print('Check error value: {:.6f}'.format(score))

    imp = get_importance(gbm, features)
    print('Importance array: ', imp)

    print(""Predict test set..."")
    test_prediction = gbm.predict(xgb.DMatrix(test[features]), ntree_limit=gbm.best_iteration+1)

    print('Training time: {} minutes'.format(round((time.time() - start_time)/60, 2)))
    return test_prediction.tolist()


def simple_load():

print(""Read people.csv..."")
people = pd.read_csv(""../input/people.csv"",
dtype={\'people_id\': np.str,
\'activity_id\': np.str,
\'char_38\': np.int32},
parse_dates=[\'date\'])

print(""Load train.csv..."")
train = pd.read_csv(""../input/act_train.csv"",
dtype={\'people_id\': np.str,
\'activity_id\': np.str,
\'outcome\': np.int8},
parse_dates=[\'date\'])

print(""Load test.csv..."")
test = pd.read_csv(""../input/act_test.csv"",
dtype={\'people_id\': np.str,
\'activity_id\': np.str},
parse_dates=[\'date\'])

print(""Process tables..."")
for table in [train, test]:
table[\'activity_category\'] = table[\'activity_category\'].str.lstrip(\'type \').astype(np.int32)
for i in range(1, 11):
table[\'char_\' + str(i)].fillna(\'type -999\', inplace=True)
table[\'char_\' + str(i)] = table[\'char_\' + str(i)].str.lstrip(\'type \').astype(np.int32)
people[\'year\'] = people[\'date\'].dt.year
people[\'month\'] = people[\'date\'].dt.month
people[\'day\'] = people[\'date\'].dt.day
people[\'weekday\'] = people[\'date\'].dt.weekday
people[\'weekend\'] = ((people.weekday == 0) | (people.weekday == 6)).astype(int)
people.drop(\'date\', axis=1, inplace=True)
people[\'group_1\'] = people[\'group_1\'].str.lstrip(\'group \').astype(np.int32)
for i in range(1, 10):
people[\'char_\' + str(i)] = people[\'char_\' + str(i)].str.lstrip(\'type \').astype(np.int32)
for i in range(10, 38):
people[\'char_\' + str(i)] = people[\'char_\' + str(i)].astype(np.int32)

print(""Merge..."")
train = train.merge(people, on=""people_id"", suffixes=(""_act"", """"))
test = test.merge(people, on=""people_id"", suffixes=(""_act"", """"))

# Set index to activity id
train = train.set_index(""activity_id"")
test = test.set_index(""activity_id"")
return train, test


def group_decision(train, test, only_certain=True):
# Exploit the leak revealed by Loiso and team to try and directly infer any labels that can be inferred
# https://www.kaggle.com/c/predicting-red-hat-business-value/forums/t/22807/0-987-kernel-now-available-seems-like-leakage

# Make a lookup dataframe, and copy those in first since we can be sure of them
lookup = train.groupby([""group_1"", ""date_act""], as_index=False)[""outcome""].mean()
test = pd.merge(test.reset_index(), lookup, how=""left"", on=[""group_1"", ""date_act""]).set_index(""activity_id"")

# Create some date filling columns that we\'ll use after we append
train[""date_act_fillfw""] = train[""date_act""]
train[""date_act_fillbw""] = train[""date_act""]

# Create some group filling columns for later use
train[""group_fillfw""] = train[""group_1""]
train[""group_fillbw""] = train[""group_1""]

# Put the two data sets together and sort
df = train.append(test)
df = df.sort_values(by=[""group_1"", ""date_act""])

# Fill the dates
df[""date_act_fillfw""] = df[""date_act_fillfw""].fillna(method=""ffill"")
df[""date_act_fillbw""] = df[""date_act_fillbw""].fillna(method=""bfill"")

# Fill labels
df[""outcome_fillfw""] = df[""outcome""].fillna(method=""ffill"")
df[""outcome_fillbw""] = df[""outcome""].fillna(method=""bfill"")

# Fill the groups
df[""group_fillfw""] = df[""group_fillfw""].fillna(method=""ffill"")
df[""group_fillbw""] = df[""group_fillbw""].fillna(method=""bfill"")

# Create int booleans for whether the fillers are from the same date
df[""fw_same_date""] = (df[""date_act_fillfw""] == df[""date_act""]).astype(int)
df[""bw_same_date""] = (df[""date_act_fillbw""] == df[""date_act""]).astype(int)

# Create int booleans for whether the fillers are in the same group
df[""fw_same_group""] = (df[""group_fillfw""] == df[""group_1""]).astype(int)
df[""bw_same_group""] = (df[""group_fillbw""] == df[""group_1""]).astype(int)

# Use the filled labels only if the labels were from the same group, unless we\'re at the end of the group
df[""interfill""] = (df[""outcome_fillfw""] *
df[""fw_same_group""] +
df[""outcome_fillbw""] *
df[""bw_same_group""]) / (df[""fw_same_group""] +
df[""bw_same_group""])

# If the labels are at the end of the group, cushion by 0.5
df[""needs cushion""] = (df[""fw_same_group""] * df[""bw_same_group""] - 1).abs()
df[""cushion""] = df[""needs cushion""] * df[""interfill""] * -0.1 + df[""needs cushion""] * 0.05
df[""interfill""] = df[""interfill""] + df[""cushion""]

# Fill everything
df[""outcome""] = df[""outcome""].fillna(df[""interfill""])

if only_certain == True:
# Drop anything we\'re not 100% certain of
df = df[(df[""outcome""] == 0.0) | (df[""outcome""] == 1.0)]

# Return outcomes to the original index
test[""outcome""] = df[""outcome""]

return test[""outcome""]

def xgboost_return(train,test,features):
    print(""Process tables... "")
    for table in [train, test]:
        table['year'] = table['date'].dt.year
        table['month'] = table['date'].dt.month
        table['day'] = table['date'].dt.day
        table['weekday'] = table['date'].dt.weekday
        table['weekend'] = ((table.weekday == 0) | (table.weekday == 6)).astype(int)
        table.drop('date', axis=1, inplace=True)
    features.remove('date')
    features.remove('date_act')
    test[""extra outcomes""] = run_single(train,test,features,""outcome"")
    return test[""extra outcomes""]

def model():

    # Load in the data set simply by merging together
    train, test = simple_load()

    # Get features
    features = get_features(train,test)

    # Try to just infer the correct dates using the data leak
    test[""outcome""] = group_decision(train, test, only_certain=False)

    # Write the inferred predictions to a template
    test.reset_index()[[""activity_id"", ""outcome""]].to_csv(""starter_template.csv"", index=False)

    # Fill any rows that could not be inferred with the XGBoost predictions
    test[""outcome""] = test[""outcome""].fillna(xgboost_return(train,test,features))

    return test.reset_index()[[""activity_id"", ""outcome""]]


def main():

# Write a benchmark file to the submissions folder
model().to_csv(""submission.csv"", index=False)

if __name__ == ""__main__"":
main()",No,5,53.0 sub_df = pd.read_csv('../input/sample_submission.csv'),No,5,45.0 "# Size of the dataframe print(dataset.shape) # We can see that there are 15120 instances having 55 attributes #Learning : Data is loaded successfully as dimensions match the data description",No,5,58.0 "# Datatypes of the attributes print(dataset.dtypes) # Learning : Data types of all attributes has been inferred as int64",No,5,70.0 sub_df.head(),No,5,41.0 "# Statistical description pandas.set_option('display.max_columns', None) print(dataset.describe()) # Learning : # No attribute is missing as count is 15120 for all attributes. Hence, all rows can be used # Negative value(s) present in Vertical_Distance_To_Hydrology. Hence, some tests such as chi-sq cant be used. # Wilderness_Area and Soil_Type are one hot encoded. Hence, they could be converted back for some analysis # Attributes Soil_Type7 and Soil_Type15 can be removed as they are constant # Scales are not the same for all. Hence, rescaling and standardization may be necessary for some algos",No,3,23.0 train_df = pd.read_csv('../input/act_train.csv'),No,5,45.0 "train_df['activity_category'] = train_df['activity_category'].astype('category').cat.codes columns = ['char_'+str(i) for i in range(1,11)] train_df[columns] = train_df[columns].apply(lambda x: x.astype('category').cat.codes) train_df['date'] = pd.to_datetime(train_df['date']) train_df['day'] = train_df['date'].apply(lambda x:x.day) train_df['year'] = train_df['date'].apply(lambda x:x.year) train_df['month'] = train_df['date'].apply(lambda x:x.month) train_df = train_df.drop(['date'],axis = 1)",No,3,8.0 "# Skewness of the distribution print(dataset.skew()) # Values close to 0 show less skew # Several attributes in Soil_Type show a large skew. Hence, some algos may benefit if skew is corrected",No,4,40.0 "# Number of instances belonging to each class dataset.groupby('Cover_Type').size() # We see that all classes have an equal presence. No class re-balancing is necessary",No,5,60.0 "import numpy

# Correlation tells relation between two attributes.
# Correlation requires continous data. Hence, ignore Wilderness_Area and Soil_Type as they are binary

#sets the number of features considered
size = 10

#create a dataframe with only \'size\' features
data=dataset.iloc[:,:size]

#get the names of all the columns
cols=data.columns

# Calculates pearson co-efficient for all combinations
data_corr = data.corr()

# Set the threshold to select only only highly correlated attributes
threshold = 0.5

# List of pairs along with correlation above threshold
corr_list = []

#Search for the highly correlated pairs
for i in range(0,size):  # for 'size' features
    for j in range(i+1,size):  # avoid repetition
        if (data_corr.iloc[i,j] >= threshold and data_corr.iloc[i,j] < 1) or (data_corr.iloc[i,j] < 0 and data_corr.iloc[i,j] <= -threshold):
            corr_list.append([data_corr.iloc[i,j],i,j])  # store correlation and columns index

#Sort to show higher ones first
s_corr_list = sorted(corr_list,key=lambda x: -abs(x[0]))

#Print correlations and column names
for v,i,j in s_corr_list:
    print (""%s and %s = %.2f"" % (cols[i],cols[j],v))

# Strong correlation is observed between the following pairs
# This represents an opportunity to reduce the feature set through transformations such as PCA",No,5,53.0 "#import plotting libraries import seaborn as sns import matplotlib.pyplot as plt # Scatter plot of only the highly correlated pairs for v,i,j in s_corr_list: sns.pairplot(dataset, hue=""Cover_Type"", size=6, x_vars=cols[i],y_vars=cols[j] ) plt.show() #The plots show to which class does a point belong to. The class distribution overlaps in the plots. #Hillshade patterns give a nice ellipsoid patterns with each other #Aspect and Hillshades attributes form a sigmoid pattern #Horizontal and vertical distance to hydrology give an almost linear pattern.",No,5,33.0 "# We will visualize all the attributes using Violin Plot - a combination of box and density plots #names of all the attributes cols = dataset.columns #number of attributes (exclude target) size = len(cols)-1 #x-axis has target attribute to distinguish between classes x = cols[size] #y-axis shows values of an attribute y = cols[0:size] #Plot violin for all attributes for i in range(0,size): sns.violinplot(data=dataset,x=x,y=y[i]) plt.show() #Elevation is has a separate distribution for most classes. Highly correlated with the target and hence an important attribute #Aspect contains a couple of normal distribution for several classes #Horizontal distance to road and hydrology have similar distribution #Hillshade 9am and 12pm display left skew #Hillshade 3pm is normal #Lots of 0s in vertical distance to hydrology #Wilderness_Area3 gives no class distinction. As values are not present, others gives some scope to distinguish #Soil_Type, 1,5,8,9,12,14,18-22, 25-30 and 35-40 offer class distinction as values are not present for many classes",No,5,33.0 "# Group one-hot encoded variables of a category into one single variable

#names of all the columns
cols = dataset.columns

#number of rows=r , number of columns=c
r,c = dataset.shape

#Create a new dataframe with r rows, one column for each encoded category, and target in the end
data = pandas.DataFrame(index=numpy.arange(0, r),columns=[\'Wilderness_Area\',\'Soil_Type\',\'Cover_Type\'])

#Make an entry in \'data\' for each r as category_id, target value
for i in range(0,r):
    w=0;
    s=0;
    # Category1 range
    for j in range(10,14):
        if (dataset.iloc[i,j] == 1):
            w=j-9  # category class
            break
    # Category2 range
    for k in range(14,54):
        if (dataset.iloc[i,k] == 1):
            s=k-13  # category class
            break
    # Make an entry in 'data' for each r as category_id, target value
    data.iloc[i]=[w,s,dataset.iloc[i,c-1]]
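# Vectorized alternative (added sketch, not in the original): assuming each row
# has exactly one 1 in each one-hot group, the same ids can be derived without
# the explicit row loop:
#   data['Wilderness_Area'] = dataset.iloc[:, 10:14].values.argmax(axis=1) + 1
#   data['Soil_Type'] = dataset.iloc[:, 14:54].values.argmax(axis=1) + 1
#   data['Cover_Type'] = dataset.iloc[:, c-1].values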

#Plot for Category1
sns.countplot(x=""Wilderness_Area"", hue=""Cover_Type"", data=data)
plt.show()
#Plot for Category2
plt.rc(""figure"", figsize=(25, 10))
sns.countplot(x=""Soil_Type"", hue=""Cover_Type"", data=data)
plt.show()

#(right-click and open the image in a new window for larger size)
#WildernessArea_4 has a lot of presence for cover_type 4. Good class distinction
#WildernessArea_3 has not much class distinction
#SoilType 1-6,10-14,17, 22-23, 29-33,35,38-40 offer lot of class distinction as counts for some are very high",No,3,20.0 "#Removal list initialize rem = [] #Add constant columns as they don't help in prediction process for c in dataset.columns: if dataset[c].std() == 0: #standard deviation is zero rem.append(c) #drop the columns dataset.drop(rem,axis=1,inplace=True) print(rem) #Following columns are dropped",No,5,10.0 train_df.corr().outcome,No,5,40.0 "rank_df = pandas.DataFrame(data=[x[7] for x in X_all_add],columns=cols[:c-1]) _ = rank_df.boxplot(rot=90) #Below plot summarizes the rankings according to the standard feature selection techniques #Top ranked attributes are ... first 10 attributes, Wilderness_Area1,4 ...Soil_Type 3,4,10,38-40",No,3,12.0 "rank_df = pandas.DataFrame(data=[x[7] for x in X_all_add],columns=cols[:c-1])
med = rank_df.median()
print(med)
#Write medians to output file for exploratory study on ML algorithms
with open(""median.csv"", ""w"") as subfile:
subfile.write(""Column,Median\
"")
subfile.write(med.to_string())",No,4,40.0 "# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here\'s several helpful packages to load in

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the ""../input/"" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
print(check_output([""ls"", ""../input""]).decode(""utf8""))

# Any results you write to the current directory are saved as output.

print (\'Loading input files..\')
print ()
people = pd.read_csv(\'../input/people.csv\',
dtype={\'people_id\': np.str,
\'activity_id\': np.str,
\'char_38\': np.int32},
parse_dates=[\'date\'])
train = pd.read_csv(r\'../input/act_train.csv\',
dtype={\'people_id\': np.str,
\'activity_id\': np.str,
\'outcome\': np.int8},
parse_dates=[\'date\'])
test = pd.read_csv(\'../input/act_test.csv\',
dtype={\'people_id\': np.str,
\'activity_id\': np.str},
parse_dates=[\'date\'])

missing_values = []

print (\'Train set features\')
print (\'------------------\')
for col in train:
    unique = train[col].unique()
    print (str(col) + ' has ' + str(unique.size) + ' unique values')

    if (True in pd.isnull(unique)):
        print (str(col) + ' has ' + str(pd.isnull(train[col]).sum()) + ' missing values')
print ()

print ()

print (\'Processing the datasets..\')
print ()
for data in [train,test]:
    for i in range(1,11):
        data['char_'+str(i)].fillna('type -1', inplace=True)
        data['char_'+str(i)] = data['char_'+str(i)].str.lstrip('type ').astype(np.int32)

    data['activity_category'] = data['activity_category'].str.lstrip('type ').astype(np.int32)

    data['year'] = data['date'].dt.year
    data['month'] = data['date'].dt.month
    data['day'] = data['date'].dt.day
    data.drop('date', axis=1, inplace=True)

for i in range(1,10):
    people['char_' + str(i)] = people['char_' + str(i)].str.lstrip('type ').astype(np.int32)
for i in range(10, 38):
    people['char_' + str(i)] = people['char_' + str(i)].astype(np.int32)
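# Note (added, not in the original): str.lstrip('type ') removes a *set* of
# leading characters rather than the literal prefix; it works here because the
# values look like 'type 12' ('type 7' -> '7'), but str.replace('type ', '')
# would be the safer idiom.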

people[\'group_1\'] = people[\'group_1\'].str.lstrip(\'group \').astype(np.int32)
people[\'year\'] = people[\'date\'].dt.year
people[\'month\'] = people[\'date\'].dt.month
people[\'day\'] = people[\'date\'].dt.day
people.drop(\'date\', axis=1, inplace=True)

print (\'Merging the datasets..\')
print ()

train = pd.merge(train, people, how=\'left\', on=\'people_id\', left_index=True)
train.fillna(-1, inplace=True)
test = pd.merge(test, people, how=\'left\', on=\'people_id\', left_index=True)
test.fillna(-1, inplace=True)

train = train.drop([\'people_id\'], axis=1)

#Separate label and data
Y = train[\'outcome\']
X = train.drop([\'outcome\'], axis=1)
X = X.iloc[:,1:]
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=96)

#print(""cv"")
#scores = cross_val_score(rfc, X, Y, cv=4)
#print (""Mean accuracy of Random Forest: "" + scores.mean())
rfc = rfc.fit(X, Y)
#drop the people_id
test = test.drop([\'people_id\'], axis=1)
# Get the test data features, skipping the first column \'PassengerId\'
test_x = test.iloc[:, 1:]


# Predict the outcome values for the test data
test_y = list(map(int, rfc.predict(test_x)))
#file for submission
test[\'outcome\'] = test_y
test[[\'activity_id\', \'outcome\']] \\
.to_csv(\'results.csv\', index=False)",No,5,53.0 "import pandas as pd import numpy as np %matplotlib inline import seaborn as sns import matplotlib.pyplot as plt import os from sklearn.preprocessing import LabelEncoder from scipy.sparse import csr_matrix, hstack from sklearn.linear_model import LogisticRegression from sklearn.cross_validation import StratifiedKFold from sklearn.metrics import log_loss",No,5,23.0 "datadir = '../input' gatrain = pd.read_csv(os.path.join(datadir,'gender_age_train.csv'), index_col='device_id') gatest = pd.read_csv(os.path.join(datadir,'gender_age_test.csv'), index_col = 'device_id') phone = pd.read_csv(os.path.join(datadir,'phone_brand_device_model.csv')) # Get rid of duplicate device ids in phone phone = phone.drop_duplicates('device_id',keep='first').set_index('device_id') events = pd.read_csv(os.path.join(datadir,'events.csv'), parse_dates=['timestamp'], index_col='event_id') appevents = pd.read_csv(os.path.join(datadir,'app_events.csv'), usecols=['event_id','app_id','is_active'], dtype={'is_active':bool}) applabels = pd.read_csv(os.path.join(datadir,'app_labels.csv'))",No,4,45.0 "brandencoder = LabelEncoder().fit(phone.phone_brand) phone['brand'] = brandencoder.transform(phone['phone_brand']) gatrain['brand'] = phone['brand'] gatest['brand'] = phone['brand'] Xtr_brand = csr_matrix((np.ones(gatrain.shape[0]), (gatrain.trainrow, gatrain.brand))) Xte_brand = csr_matrix((np.ones(gatest.shape[0]), (gatest.testrow, gatest.brand))) print('Brand features: train shape {}, test shape {}'.format(Xtr_brand.shape, Xte_brand.shape))",No,4,20.0 "people_df = pd.read_csv('../input/people.csv') people_df.group_1.unique().shape",No,3,45.0 "people_df = pd.read_csv('../input/people.csv') columns = ['char_'+str(i) for i in range(1,10)] people_df[columns] = people_df[columns].apply(lambda x: x.astype('category').cat.codes) people_df['group_1'] = people_df['group_1'].astype('category').cat.codes people_df['date'] = pd.to_datetime(people_df['date']) people_df['day'] = people_df['date'].apply(lambda x:x.day) people_df['year'] = people_df['date'].apply(lambda x:x.year) people_df['month'] = people_df['date'].apply(lambda x:x.month) people_df = people_df.drop(['date'],axis = 1) people_df = people_df.set_index(people_df['people_id']) people_df.head()",No,4,8.0 "train_X = train_df.join(people_df,on = 'people_id', rsuffix='_people')",No,5,32.0 "Y = train_X['outcome'] X = train_X.drop(['outcome','people_id','people_id_people','activity_id'],axis = 1)",No,5,21.0 "#from sklearn.ensemble import RandomForestClassifier #clf = RandomForestClassifier(n_estimators = 10) #clf = clf.fit(X, Y) import xgboost as xgb gbm = xgb.XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.05).fit(X, Y)",No,5,7.0 "test_df = pd.read_csv('../input/act_test.csv') test_df['activity_category'] = test_df['activity_category'].astype('category').cat.codes columns = ['char_'+str(i) for i in range(1,11)] test_df[columns] = test_df[columns].apply(lambda x: x.astype('category').cat.codes) test_df['date'] = pd.to_datetime(test_df['date']) test_df['day'] = test_df['date'].apply(lambda x:x.day) test_df['year'] = test_df['date'].apply(lambda x:x.year) test_df['month'] = test_df['date'].apply(lambda x:x.month) test_df = test_df.drop(['date'],axis = 1)",No,3,8.0 "test_X = test_df.join(people_df,on = 'people_id', rsuffix='_people') X = test_X.drop(['people_id','people_id_people','activity_id'],axis = 1) #output = clf.predict(X) output = gbm.predict(X)",No,4,48.0 "test_df['outcome'] = output 
test_df.to_csv('submission.csv',columns = ['activity_id','outcome'],index = False)",No,5,25.0 import pandas as pd,No,5,22.0 "train = pd.read_csv(""../input/train.csv"") test = pd.read_csv(""../input/test.csv"") submission = pd.read_csv(""../input/sampleSubmission.csv"")",No,5,45.0 "print(""Train dataset:"") print(train.head()) print(""Test dataset:"") print(test.head()) print(""Sample submission:"") print(submission.head())",No,5,41.0 print(train.describe()),No,5,40.0 "mean = train.describe()[""count""][""mean""]",No,5,40.0 "m = phone.phone_brand.str.cat(phone.device_model) modelencoder = LabelEncoder().fit(m) phone['model'] = modelencoder.transform(m) gatrain['model'] = phone['model'] gatest['model'] = phone['model'] Xtr_model = csr_matrix((np.ones(gatrain.shape[0]), (gatrain.trainrow, gatrain.model))) Xte_model = csr_matrix((np.ones(gatest.shape[0]), (gatest.testrow, gatest.model))) print('Model features: train shape {}, test shape {}'.format(Xtr_model.shape, Xte_model.shape))",No,4,20.0 "appencoder = LabelEncoder().fit(appevents.app_id) appevents['app'] = appencoder.transform(appevents.app_id) napps = len(appencoder.classes_) deviceapps = (appevents.merge(events[['device_id']], how='left',left_on='event_id',right_index=True) .groupby(['device_id','app'])['app'].agg(['size']) .merge(gatrain[['trainrow']], how='left', left_index=True, right_index=True) .merge(gatest[['testrow']], how='left', left_index=True, right_index=True) .reset_index()) deviceapps.head()",Yes,3,20.0 "d = deviceapps.dropna(subset=['trainrow']) Xtr_app = csr_matrix((np.ones(d.shape[0]), (d.trainrow, d.app)), shape=(gatrain.shape[0],napps)) d = deviceapps.dropna(subset=['testrow']) Xte_app = csr_matrix((np.ones(d.shape[0]), (d.testrow, d.app)), shape=(gatest.shape[0],napps)) print('Apps data: train shape {}, test shape {}'.format(Xtr_app.shape, Xte_app.shape))",Yes,4,17.0 "applabels = applabels.loc[applabels.app_id.isin(appevents.app_id.unique())] applabels['app'] = appencoder.transform(applabels.app_id) labelencoder = LabelEncoder().fit(applabels.label_id) applabels['label'] = labelencoder.transform(applabels.label_id) nlabels = len(labelencoder.classes_)",No,4,20.0 "devicelabels = (deviceapps[['device_id','app']] .merge(applabels[['app','label']]) .groupby(['device_id','label'])['app'].agg(['size']) .merge(gatrain[['trainrow']], how='left', left_index=True, right_index=True) .merge(gatest[['testrow']], how='left', left_index=True, right_index=True) .reset_index()) devicelabels.head()",Yes,4,32.0 "d = devicelabels.dropna(subset=['trainrow']) Xtr_label = csr_matrix((np.ones(d.shape[0]), (d.trainrow, d.label)), shape=(gatrain.shape[0],nlabels)) d = devicelabels.dropna(subset=['testrow']) Xte_label = csr_matrix((np.ones(d.shape[0]), (d.testrow, d.label)), shape=(gatest.shape[0],nlabels)) print('Labels data: train shape {}, test shape {}'.format(Xtr_label.shape, Xte_label.shape))",No,4,17.0 "Xtrain = hstack((Xtr_brand, Xtr_model, Xtr_app, Xtr_label), format='csr') Xtest = hstack((Xte_brand, Xte_model, Xte_app, Xte_label), format='csr') print('All features: train shape {}, test shape {}'.format(Xtrain.shape, Xtest.shape))",No,4,11.0 "targetencoder = LabelEncoder().fit(gatrain.gender) y = targetencoder.transform(gatrain.gender) nclasses = len(targetencoder.classes_)",No,5,20.0 "clf = LogisticRegression(C=0.08)#, multi_class='multinomial',solver='lbfgs') clf.fit(Xtrain[70001:], y[70001:]) pred = pd.DataFrame(clf.predict_proba(Xtrain[70001:]), index=gatrain.iloc[70001:].index, columns=targetencoder.classes_) 
pred.head()",Yes,3,7.0 "pred.to_csv('test_gender.csv',index=True)",No,5,25.0 "import sys import os import math import numpy as np import pandas as pd import seaborn as sns from scipy.stats import norm, gumbel_r from scipy.optimize import linprog import matplotlib import matplotlib.pyplot as plt from sklearn.utils.extmath import cartesian %matplotlib inline",No,5,23.0 "submission.to_csv(""submission.csv"", index=False)",No,5,25.0 "def plot_bag_weight_distributions(bags, size=10000): plot_distributions(bags, create_bag_weight_sampler, size=size, fit=norm) def plot_distributions(bags, sampler_builder, size=10000, fit=None): num_plots = len(bags) num_cols = int(round(math.sqrt(num_plots))) num_rows = (num_plots // num_cols) num_rows = num_rows if num_plots % num_cols == 0 else num_rows + 1 f, axes = plt.subplots(num_rows, num_cols) axes = axes.reshape(-1) for i in range(num_plots): current_bag = bags[i] current_bag_sampler, current_bag_name = sampler_builder(current_bag) current_sample = current_bag_sampler(size) print(""{}: mean={} | std={}"".format(current_bag_name, np.mean(current_sample), np.std(current_sample))) current_axis = axes[i] sns.distplot(current_sample, ax=current_axis, fit=fit, kde=False) current_axis.set_title(current_bag_name) current_axis.set_yticklabels([]) plt.tight_layout() plt.show() single_gift_bags = [ {""horse"": 1}, {""ball"": 1}, {""bike"": 1}, {""train"": 1}, {""coal"": 1}, {""book"": 1}, {""doll"": 1}, {""blocks"": 1}, {""gloves"": 1} ] plot_bag_weight_distributions(single_gift_bags)",No,5,33.0 "example_bags = [ {""horse"": 1, ""ball"": 2}, {""train"": 3, ""bike"": 1}, {""coal"": 2, ""book"": 2}, {""gloves"": 12, ""book"": 12}, ] plot_bag_weight_distributions(example_bags)",No,5,33.0 "def plot_bag_utility_distributions(bags, size=10000, fit=norm):
plot_distributions(bags, create_bag_utility_sampler, size=size, fit=fit)

def create_bag_utility_sampler(bag):
bag_weight_sampler, bag_name = create_bag_weight_sampler(bag)
def bag_utility_sampler(size=1):
samples = bag_weight_sampler(size)
samples[samples > 50] = 0
return samples
return bag_utility_sampler, bag_name

bag = { ""horse"": 2, ""ball"": 19 }
bag_utility_sampler, name = create_bag_utility_sampler(bag)
print(""Sampling utility from bag {}: {}\
"".format(name, bag_utility_sampler(9)))
plot_bag_utility_distributions(example_bags)",Yes,4,33.0 "def plot_score_distribution(bags, num_tries=60, size=10000, fit=norm, extremal_fit=gumbel_r):
scores = np.zeros(size)
for i, bag in enumerate(bags):
current_bag_sampler, _ = create_bag_utility_sampler(bag)
scores += current_bag_sampler(size)
score_mean, score_std = np.mean(scores), np.std(scores)
print(""Scores: mean = {:0.2f} | std = {:0.2f}"".format(score_mean, score_std))
sns.distplot(scores, fit=fit, kde=False)

plot_extreme_value_distribution(scores, num_tries)
plt.title(""Score distribution / submission distribution with {} tries"".format(num_tries))
plt.show()

def plot_extreme_value_distribution(scores, num_tries, size=10000):
samples = np.max(np.random.choice(scores, size=(size, num_tries)), axis=1)
sns.distplot(samples, fit=gumbel_r, kde=False)
expected_score = np.mean(samples)
plt.axvline(expected_score, color=\'r\')
print(""Expected score after {} trials: {:0.2f}"".format(num_tries, expected_score))

plot_score_distribution(example_bags)",Yes,4,33.0 "def drop_duplicate(candidate_bags, distributions): df = pd.DataFrame(data=np.hstack((candidate_bags, distributions)), columns=gifts + [""mean"", ""var""]) df.drop_duplicates(subset=gifts, inplace=True) return df[gifts].values, df[[""mean"", ""var""]].values candidate_bags = np.vstack([mixed_item_candiadte_bags, low_weight_item_candidate_bags]) bag_weight_distributions = np.vstack([mixed_item_bag_weight_distributions, low_weight_item_bag_weight_distributions]) print(""Combined candiadte bags: {}"".format(candidate_bags.shape)) candidate_bags, bag_weight_distributions = drop_duplicate(candidate_bags, bag_weight_distributions) print(""Final candidate bags without duplicates: {}"".format(candidate_bags.shape))",Yes,3,11.0 pd.read_csv('submission_20.csv').head(),No,5,41.0 "import numpy as np import pandas as pd from subprocess import check_output print(check_output([""ls"", ""../input""]).decode(""utf8""))",No,5,88.0 "train = pd.read_json(""../input/train.json"") train.info()",No,5,44.0 "X = train.drop(""interest_level"", 1) Y = train[""interest_level""].astype(""category"")",No,5,21.0 "X[""street_address""] = X[""street_address""].astype(\'category\').cat.codes
X[""created""] = X[""created""].astype(\'category\').cat.codes
X[""building_id""] = X[""building_id""].astype(\'category\').cat.codes
X[""description""] = X[""description""].astype(\'category\').cat.codes
X[""display_address""] = X[""display_address""].astype(\'category\').cat.codes
X[""manager_id""] = X[""manager_id""].astype(\'category\').cat.codes",No,5,8.0 "features = [""bathrooms"", ""bedrooms"", ""latitude"", ""longitude"", ""price"", ""street_address"", ""created"", ""description"", ""display_address""] X = X[features]",No,4,21.0 "from sklearn.model_selection import train_test_split X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=42)",No,5,13.0 "from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

classifiers = [
KNeighborsClassifier(3),
DecisionTreeClassifier(),
RandomForestClassifier(),
AdaBoostClassifier(),
GradientBoostingClassifier(),
GaussianNB(),
LinearDiscriminantAnalysis(),
QuadraticDiscriminantAnalysis()]

# Logging for Visual Comparison
log_cols=[""Classifier"", ""Accuracy"", ""Log Loss""]
log = pd.DataFrame(columns=log_cols)

for clf in classifiers:
    clf.fit(X_train, y_train)
    name = clf.__class__.__name__

    print(""=""*30)
    print(name)

    print('****Results****')
    train_predictions = clf.predict(X_test)
    acc = accuracy_score(y_test, train_predictions)
    print(""Accuracy: {:.4%}"".format(acc))

    train_predictions = clf.predict_proba(X_test)
    ll = log_loss(y_test, train_predictions)
    print(""Log Loss: {}"".format(ll))

    log_entry = pd.DataFrame([[name, acc*100, ll]], columns=log_cols)
    log = log.append(log_entry)

print(""=""*30)",Yes,3,4.0 "test = pd.read_json(""../input/test.json"") index = test[""listing_id""] test = test[features]",No,5,44.0 "olist = list(test.select_dtypes(['object'])) for col in olist: test[col] = test[col].astype('category').cat.codes",No,5,8.0 "favorite_clf = LinearDiscriminantAnalysis() favorite_clf.fit(X_train, y_train) test_predictions = favorite_clf.predict_proba(test)",Yes,3,7.0 "submission = pd.DataFrame({
""listing_id"": index,
""high"": test_predictions[:,0],
""medium"":test_predictions[:,2],
""low"":test_predictions[:,1]
})

columnsTitles=[""listing_id"",""high"",""medium"",""low""]
submission=submission.reindex(columns=columnsTitles)
submission.to_csv(\'submission.csv\', index=False)",Yes,4,25.0 "from keras.models import Sequential from keras.layers import Dense",No,5,22.0 "from keras.utils import np_utils from sklearn.preprocessing import LabelEncoder encoder = LabelEncoder() encoder.fit(Y) encoded_Y = encoder.transform(Y) # convert integers to dummy variables (i.e. one hot encoded) dummy_y = np_utils.to_categorical(encoded_Y)",No,5,20.0 "# Compile model model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) # Fit the model model.fit(X.values, dummy_y, nb_epoch=10, batch_size=10)",No,4,7.0 "import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) from subprocess import check_output from sklearn.discriminant_analysis import LinearDiscriminantAnalysis from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier from sklearn.model_selection import train_test_split from sklearn.metrics import log_loss print(check_output([""ls"", ""../input""]).decode(""utf8""))",No,5,88.0 "sub = pd.read_csv('../input/sample_submission.csv') sub.head()",Yes,4,45.0 "df = pd.read_json('../input/train.json') df.tail()",Yes,4,44.0 "df['address'] = df['display_address'].astype('category').cat.codes df['street_address'] = df['street_address'].astype('category').cat.codes df['building_id'] = df['building_id'].astype('category').cat.codes df['manager_id'] = df['manager_id'].astype('category').cat.codes df['num_features'] = df['features'].apply(len) df['created'] = pd.to_datetime(df['created']) df['created_year'] = df['created'].dt.year.astype('category').cat.codes df['created_month'] = df['created'].dt.month.astype('category').cat.codes df['len_description'] = df['description'].apply(lambda x: len(x.split(' '))) df['num_pics'] = df['photos'].apply(len)",No,5,8.0 "new_feat = ['price','address','manager_id','building_id', 'num_features','created_year','created_month', 'len_description','latitude','longitude','num_pics'] #new_feat = ['price','latitude','longitude','num_pics', # 'num_features','created_year','created_month','len_description'] X = df[new_feat].fillna(0) y = df['interest_level'].astype('category').cat.codes X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=9) X.tail()",Yes,3,21.0 "clf1 = GradientBoostingClassifier(n_estimators=200, max_depth=9) clf2 = AdaBoostClassifier(n_estimators=200) clf3 = RandomForestClassifier(n_estimators=300) estimators = [('gb', clf1), ('ab', clf2), ('rf', clf3)] vclf = VotingClassifier(estimators=estimators, voting='soft', n_jobs= -1) vclf.fit(X_train, y_train) y_val_pred = vclf.predict_proba(X_val) log_loss(y_val, y_val_pred)",Yes,3,4.0 "X_train = df[new_feat].fillna(0) y_train = df['interest_level'] vclf.fit(X_train, y_train) df2 = pd.read_json('../input/test.json') df2['address'] = df2['display_address'].astype('category').cat.codes df2['street_address'] = df2['street_address'].astype('category').cat.codes df2['building_id'] = df2['building_id'].astype('category').cat.codes df2['manager_id'] = df2['manager_id'].astype('category').cat.codes df2['num_features'] = df2['features'].apply(len) df2['created'] = pd.to_datetime(df2['created']) df2['created_year'] = df2['created'].dt.year.astype('category').cat.codes df2['created_month'] = df2['created'].dt.month.astype('category').cat.codes df2['len_description'] = df2['description'].apply(lambda x: len(x.split(' '))) df2['num_pics'] = df2['photos'].apply(len) X = df2[new_feat].fillna(0) y = 
vclf.predict_proba(X)",Yes,3,8.0 "sub = pd.read_csv('submissionVoting.csv') sub.head()",Yes,4,45.0 "from sklearn.ensemble import RandomForestClassifier from sklearn.model_selection import train_test_split from sklearn.metrics import log_loss",No,5,22.0 "df = pd.read_json(open(""../input/train.json"", ""r""))",No,5,44.0 df.head(),No,5,41.0 "df[""num_photos""] = df[""photos""].apply(len) df[""num_features""] = df[""features""].apply(len) df[""num_description_words""] = df[""description""].apply(lambda x: len(x.split("" ""))) df[""created""] = pd.to_datetime(df[""created""]) df[""created_year""] = df[""created""].dt.year df[""created_month""] = df[""created""].dt.month df[""created_day""] = df[""created""].dt.day",No,5,8.0 "num_feats = [""bathrooms"", ""bedrooms"", ""latitude"", ""longitude"", ""price"", ""num_photos"", ""num_features"", ""num_description_words"", ""created_year"", ""created_month"", ""created_day""] X = df[num_feats] y = df[""interest_level""] X.head()",Yes,4,21.0 "X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.33)",No,5,13.0 "clf = RandomForestClassifier(n_estimators=1000) clf.fit(X_train, y_train) y_val_pred = clf.predict_proba(X_val) log_loss(y_val, y_val_pred)",Yes,3,4.0 "df = pd.read_json(open(""../input/test.json"", ""r"")) df[""num_photos""] = df[""photos""].apply(len) df[""num_features""] = df[""features""].apply(len) df[""num_description_words""] = df[""description""].apply(lambda x: len(x.split("" ""))) df[""created""] = pd.to_datetime(df[""created""]) df[""created_year""] = df[""created""].dt.year df[""created_month""] = df[""created""].dt.month df[""created_day""] = df[""created""].dt.day X = df[num_feats] y = clf.predict_proba(X)",Yes,4,8.0 "import numpy as np import pandas as pd # Look! No scikit learn!",No,5,22.0 "df_train = pd.read_json(open(""../input/train.json"", ""r"")) df_train.set_index(""listing_id"", inplace=True) df_test = pd.read_json(open(""../input/test.json"", ""r"")) df_test.set_index(""listing_id"", inplace=True) # We will work with a concatenation of the two, then split after the scaling. df = pd.concat([df_train, df_test])",Yes,3,44.0 "df[""num_photos""] = df[""photos""].apply(len) df[""num_features""] = df[""features""].apply(len) df[""num_description_words""] = df[""description""].apply(lambda x: len(x.split("" ""))) df[""created""] = pd.to_datetime(df[""created""]) #df[""created_year""] = df[""created""].dt.year df[""created_month""] = df[""created""].dt.month df[""created_day""] = df[""created""].dt.day",No,5,8.0 "df[""logprice""] = np.log(df.price)",No,5,8.0 "df.loc[df.bathrooms == 112, ""bathrooms""] = 1",No,5,14.0 "numeric_feat = [""bathrooms"", ""bedrooms"", ""latitude"", ""longitude"", ""logprice"", ""num_photos"", ""num_features"", ""num_description_words"", ""created_month"", ""created_day""] for col in numeric_feat: df[col] -= df[col].min() df[col] /= df[col].max()",No,5,18.0 "X_train = df.loc[df.interest_level.notnull(), numeric_feat] y_train = pd.get_dummies(df_train[[""interest_level""]], prefix="""") y_train = y_train[[""_high"", ""_medium"", ""_low""]] # Set the order according to submission X_test = df.loc[df.interest_level.isnull(), numeric_feat]",Yes,3,14.0 "## A dead simple neural network class in Python+Numpy. Plain SGD, and no regularization. 
def sigmoid(X): return 1.0 / ( 1.0 + np.exp(-X) ) def softmax(X): _sum = np.exp(X).sum() return np.exp(X) / _sum class neuralnet(object): def __init__(self, num_input, num_hidden, num_output): self._W1 = (np.random.random_sample((num_input, num_hidden)) - 0.5).astype(np.float32) self._b1 = np.zeros((1, num_hidden)).astype(np.float32) self._W2 = (np.random.random_sample((num_hidden, num_output)) - 0.5).astype(np.float32) self._b2 = np.zeros((1, num_output)).astype(np.float32) def forward(self,X): net1 = np.matmul( X, self._W1 ) + self._b1 y = sigmoid(net1) net2 = np.matmul( y, self._W2 ) + self._b2 z = softmax(net2) return z,y def backpropagation(self, X, target, eta): z, y = self.forward(X) d2 = (z - target) d1 = y*(1.0-y) * np.matmul(d2, self._W2.T) # The updates are done within this method. This more or less implies # utpdates with Stochastic Gradient Decent. Let's fix that later. # TODO: Support for full batch and mini-batches etc. self._W2 -= eta * np.matmul(y.T,d2) self._W1 -= eta * np.matmul(X.reshape((-1,1)),d1) self._b2 -= eta * d2 self._b1 -= eta * d1",No,5,4.0 "# Some hyper-parameters to tune. num_hidden = 17 # I think I get about 1 epoch/sec with this size on the docker instance n_epochs = 100 eta = 0.01",No,5,59.0 "import os import sys import operator import numpy as np import pandas as pd from scipy import sparse import xgboost as xgb from sklearn import model_selection, preprocessing, ensemble from sklearn.metrics import log_loss from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer ",No,5,22.0 "data_path = ""../input/"" train_file = data_path + ""train.json"" test_file = data_path + ""test.json"" train_df = pd.read_json(train_file) test_df = pd.read_json(test_file) print(train_df.shape) print(test_df.shape) ",Yes,3,44.0 "# count of photos # train_df[""num_photos""] = train_df[""photos""].apply(len) test_df[""num_photos""] = test_df[""photos""].apply(len) # count of ""features"" # train_df[""num_features""] = train_df[""features""].apply(len) test_df[""num_features""] = test_df[""features""].apply(len) # count of words present in description column # train_df[""num_description_words""] = train_df[""description""].apply(lambda x: len(x.split("" ""))) test_df[""num_description_words""] = test_df[""description""].apply(lambda x: len(x.split("" ""))) # convert the created column to datetime object so as to extract more features train_df[""created""] = pd.to_datetime(train_df[""created""]) test_df[""created""] = pd.to_datetime(test_df[""created""]) # Let us extract some features like year, month, day, hour from date columns # train_df[""created_year""] = train_df[""created""].dt.year test_df[""created_year""] = test_df[""created""].dt.year train_df[""created_month""] = train_df[""created""].dt.month test_df[""created_month""] = test_df[""created""].dt.month train_df[""created_day""] = train_df[""created""].dt.day test_df[""created_day""] = test_df[""created""].dt.day train_df[""created_hour""] = train_df[""created""].dt.hour test_df[""created_hour""] = test_df[""created""].dt.hour # adding all these new features to use list # features_to_use.extend([""num_photos"", ""num_features"", ""num_description_words"",""created_year"", ""created_month"", ""created_day"", ""listing_id"", ""created_hour""]) ",No,4,8.0 "categorical = [""display_address"", ""manager_id"", ""building_id"", ""street_address""]
for f in categorical:
    if train_df[f].dtype == 'object':
        #print(f)
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(train_df[f].values) + list(test_df[f].values))
        train_df[f] = lbl.transform(list(train_df[f].values))
        test_df[f] = lbl.transform(list(test_df[f].values))
        features_to_use.append(f)
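# Note: fitting the encoder on the combined train and test values keeps the
# integer codes consistent across both frames, so a category that appears only
# in the test set does not raise an error at transform time.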
",No,5,20.0 "train_df[\'features\'] = train_df[""features""].apply(lambda x: "" "".join([""_"".join(i.split("" "")) for i in x]))
test_df['features'] = test_df[""features""].apply(lambda x: "" "".join([""_"".join(i.split("" "")) for i in x]))
print(train_df[""features""].head())
tfidf = CountVectorizer(stop_words='english', max_features=200)
tr_sparse = tfidf.fit_transform(train_df[""features""])
te_sparse = tfidf.transform(test_df[""features""])
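# tr_sparse and te_sparse are scipy sparse matrices with one column per learned
# feature token (at most 200 here); they get hstack-ed with the numeric columns
# in a later cell before training.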
",Yes,3,8.0 "train_X = sparse.hstack([train_df[features_to_use], tr_sparse]).tocsr() test_X = sparse.hstack([test_df[features_to_use], te_sparse]).tocsr() target_num_map = {'high':0, 'medium':1, 'low':2} train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x])) print(train_X.shape, test_X.shape) ",Yes,3,11.0 "import os import sys import operator import numpy as np import pandas as pd from scipy import sparse import xgboost as xgb from sklearn import model_selection, preprocessing, ensemble from sklearn.metrics import log_loss from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer",No,5,22.0 "data_path = ""../input/"" train_file = data_path + ""train.json"" test_file = data_path + ""test.json"" train_df = pd.read_json(train_file) test_df = pd.read_json(test_file) print(train_df.shape) print(test_df.shape)",Yes,4,44.0 "# count of photos # train_df[""num_photos""] = train_df[""photos""].apply(len) test_df[""num_photos""] = test_df[""photos""].apply(len) # count of ""features"" # train_df[""num_features""] = train_df[""features""].apply(len) test_df[""num_features""] = test_df[""features""].apply(len) # count of words present in description column # train_df[""num_description_words""] = train_df[""description""].apply(lambda x: len(x.split("" ""))) test_df[""num_description_words""] = test_df[""description""].apply(lambda x: len(x.split("" ""))) # convert the created column to datetime object so as to extract more features train_df[""created""] = pd.to_datetime(train_df[""created""]) test_df[""created""] = pd.to_datetime(test_df[""created""]) # Let us extract some features like year, month, day, hour from date columns # train_df[""created_year""] = train_df[""created""].dt.year test_df[""created_year""] = test_df[""created""].dt.year train_df[""created_month""] = train_df[""created""].dt.month test_df[""created_month""] = test_df[""created""].dt.month train_df[""created_day""] = train_df[""created""].dt.day test_df[""created_day""] = test_df[""created""].dt.day train_df[""created_hour""] = train_df[""created""].dt.hour test_df[""created_hour""] = test_df[""created""].dt.hour # adding all these new features to use list # features_to_use.extend([""num_photos"", ""num_features"", ""num_description_words"",""created_year"", ""created_month"", ""created_day"", ""listing_id"", ""created_hour""])",No,4,8.0 "categorical = [""display_address"", ""manager_id"", ""building_id"", ""street_address""]
for f in categorical:
    if train_df[f].dtype == 'object':
        #print(f)
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(train_df[f].values) + list(test_df[f].values))
        train_df[f] = lbl.transform(list(train_df[f].values))
        test_df[f] = lbl.transform(list(test_df[f].values))
        features_to_use.append(f)",No,5,20.0 "train_df['features'] = train_df[""features""].apply(lambda x: "" "".join([""_"".join(i.split("" "")) for i in x]))
test_df['features'] = test_df[""features""].apply(lambda x: "" "".join([""_"".join(i.split("" "")) for i in x]))
print(train_df[""features""].head())
tfidf = CountVectorizer(stop_words='english', max_features=200)
tr_sparse = tfidf.fit_transform(train_df[""features""])
te_sparse = tfidf.transform(test_df[""features""])",Yes,4,8.0 "train_X = sparse.hstack([train_df[features_to_use], tr_sparse]).tocsr() test_X = sparse.hstack([test_df[features_to_use], te_sparse]).tocsr() target_num_map = {'high':0, 'medium':1, 'low':2} train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x])) print(train_X.shape, test_X.shape)",Yes,3,11.0 "out_df.to_csv(""xgb_starter2.csv"", index=False)",No,5,25.0 "df = pd.read_json(open(""../input/train.json"", \'r\'))
df.head()",Yes,4,44.0 "df['num_photos'] = df['photos'].apply(len) df['num_features'] = df['features'].apply(len) df['num_description_words'] = df['description'].apply(lambda x: len(x.split(' '))) df['created'] = pd.to_datetime(df['created']) df['created_year'] = df['created'].dt.year df['created_month'] = df['created'].dt.month df['created_day'] = df['created'].dt.day df['created_hour'] = df['created'].dt.hour df['created_minute'] = df['created'].dt.minute",No,5,8.0 "# price: removing values in 99 percentile df = remove_outlier(df, 'price', [99]) # Latitude & Longitude: # removing outliers: values in the 1/99 percentiles df = remove_outlier(df, 'latitude', [1, 99]) df = remove_outlier(df, 'longitude', [1, 99])",No,4,8.0 "num_feats = ['bathrooms', 'bedrooms', 'latitude', 'longitude', 'price', 'num_photos', 'num_features', 'num_description_words', 'created_year', 'created_month', 'created_day', 'created_hour', 'created_minute'] X = df[num_feats] y = df['interest_level'] X.head()",Yes,4,21.0 "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)",No,5,13.0 "clf = RandomForestClassifier(n_estimators=1000) clf.fit(X_train, y_train) y_val_pred = clf.predict_proba(X_test) log_loss(y_test, y_val_pred)",Yes,3,4.0 "# fitting the model on the entire data without split clf.fit(X, y)",No,5,7.0 "df = pd.read_json(open(""../input/test.json"", \'r\'))
df['num_photos'] = df['photos'].apply(len)
df['num_features'] = df['features'].apply(len)
df['num_description_words'] = df['description'].apply(lambda x: len(x.split(' ')))
df['created'] = pd.to_datetime(df['created'])
df['created_year'] = df['created'].dt.year
df['created_month'] = df['created'].dt.month
df['created_day'] = df['created'].dt.day
df['created_hour'] = df['created'].dt.hour
df['created_minute'] = df['created'].dt.minute
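# The test frame has to receive the same engineered columns as the training
# frame (same names, same meaning), otherwise the fitted classifier cannot
# score it.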
X = df[num_feats]",Yes,4,8.0 y = clf.predict_proba(X),No,5,27.0 "from sklearn.ensemble import ExtraTreesClassifier from sklearn.model_selection import train_test_split from sklearn.metrics import log_loss",No,5,22.0 "clf = ExtraTreesClassifier(n_estimators=100) clf.fit(X_train, y_train) y_val_pred = clf.predict_proba(X_val) log_loss(y_val, y_val_pred)",Yes,3,4.0 "import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from subprocess import check_output

# Load and describe data
# print(check_output([""ls"", ""../input""]).decode(""utf8""))
df = pd.read_json(open(""../input/train.json"", ""r""))
df['n_photos'] = df['photos'].apply(len)
df['n_features'] = df['features'].apply(len)
df['ilevel_categ'] = df['interest_level'].map({'low': 1, 'medium': 2, 'high': 3})
df[""n_description_words""] = df[""description""].apply(lambda x: len(x.split("" "")))
df.info()",No,4,8.0 "import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) import matplotlib.pyplot as plt #plotting library %matplotlib inline import seaborn as sns #plotting library sns.set(color_codes=True) sns.set_style(""white"") ",No,5,23.0 "import sklearn.ensemble from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier from sklearn.model_selection import train_test_split from sklearn.metrics import accuracy_score, log_loss ",No,5,22.0 "# This Python 3 environment comes with many helpful analytics libraries installed
train_data = pd.read_json(""../input/train.json"")
test_data = pd.read_json(""../input/test.json"")
display_count = 2
target = 'interest_level'
",Yes,4,44.0 "train_data.iloc[0] ",No,5,14.0 "train_data['rooms'] = train_data['bedrooms'] + train_data['bathrooms'] train_data['living_rooms'] = train_data['bedrooms'] - train_data['bathrooms'] train_data['even_rooms'] = train_data['rooms'].apply(lambda x : (x%2) == 0) ",No,5,8.0 "def price_per_room(row): rooms = row['rooms'] if rooms == 0: return -1 price_per_room = row['price'] / rooms return price_per_room train_data['price_per_room'] = train_data.apply(lambda row: price_per_room(row), axis=1) ",No,5,8.0 "train_data['created'] = pd.to_datetime(train_data['created']) train_data['year'] = train_data['created'].dt.year train_data['month'] = train_data['created'].dt.month train_data['day'] = train_data['created'].dt.day train_data['hour'] = train_data['created'].dt.hour train_data['month'] = train_data['month'].apply(lambda x: '0' + str(x) if len(str(x)) == 1 else str(x)) train_data['day'] = train_data['day'].apply(lambda x: '0' + str(x) if len(str(x)) == 1 else str(x)) ",No,5,8.0 "train_data['MMDD'] = train_data.apply(lambda x: str(x.month) + str(x.day), axis=1) ",No,5,8.0 "month_counts = train_data.groupby(['day', 'hour']).agg({target: 'count'}) month_counts = month_counts.sort_values(target, ascending=False) ",Yes,3,60.0 "train_data['months_with_less_listings'] = train_data.apply(lambda x: 0 if x['month'] == 31 else 1, axis=1) ",No,5,8.0 "build_group = train_data.groupby([target, 'building_id']) ",No,5,60.0 "buildings_with_all_listings = unstacked_df.ix[unstacked_df['frequency'] > 1] ",No,5,14.0 print(df.shape),No,5,58.0 "clf = RandomForestClassifier(n_estimators=1000) clf.fit(X_train, y_train) y_val_pred = clf.predict_proba(X_val) print(y_val_pred.shape) log_loss(y_val, y_val_pred)",Yes,3,4.0 "df = pd.read_json(open(""../input/test.json"", ""r"")) print(df.shape) df[""num_photos""] = df[""photos""].apply(len) df[""num_features""] = df[""features""].apply(len) df[""num_description_words""] = df[""description""].apply(lambda x: len(x.split("" ""))) df[""created""] = pd.to_datetime(df[""created""]) df[""created_year""] = df[""created""].dt.year df[""created_month""] = df[""created""].dt.month df[""created_day""] = df[""created""].dt.day X = df[num_feats] y = clf.predict_proba(X)",Yes,4,8.0 "sub = pd.DataFrame() sub[""listing_id""] = df[""listing_id""] for label in [""high"", ""medium"", ""low""]: sub[label] = y[:, labels2idx[label]] sub.to_csv(""submission_rf.csv"", index=False)",No,5,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here\'s several helpful packages to load in

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
# Input data files are available in the ""../input/"" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
print(check_output([""ls"", ""../input""]).decode(""utf8""))

# Any results you write to the current directory are saved as output.
",No,5,88.0 "df = pd.read_json(open(""../input/train.json"", ""r"")) df.head(5) ",Yes,3,44.0 "df.shape ",No,5,58.0 "numeric_features = [""bathrooms"", ""bedrooms"", ""latitude"", ""longitude"", ""price""] X = df[numeric_features] y = df[""interest_level""] X.head() ",No,3,21.0 "X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.33) random_forest_classifier = RandomForestClassifier(n_estimators=1500) random_forest_classifier.fit(X_train, y_train) y_val_pred = random_forest_classifier.predict_proba(X_val) log_loss(y_val, y_val_pred) ",Yes,3,13.0 "df_test = pd.read_json(open(""../input/test.json"", ""r"")) X_test = df_test[numeric_features] y_test = random_forest_classifier.predict_proba(X_test) ",Yes,3,44.0 "plt.figure(figsize=(8, 4))
price_group.head(5)[""total_price""].plot(kind='barh', color=""orange"")
plt.show()
plt.close()
",No,4,33.0 "# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here\'s several helpful packages to load in

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the ""../input/"" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
print(check_output([""ls"", ""../input""]).decode(""utf8""))

# Any results you write to the current directory are saved as output.
",No,5,88.0 "train_data[\'building_id\'] = train_data[\'building_id\'].apply(lambda x: 1 if x != ""0"" else 0)
",No,5,8.0 "# Read the training and test data train_df = pd.read_json(""../input/train.json"") test_df = pd.read_json(""../input/test.json"") #Look at the size of test and train data print(""train data shape: "", train_df.shape[0]); print(""test data shape: "", test_df.shape[0]); ",Yes,3,44.0 "def split_X_y(train_data, features): X = train_data[features] y = train_data[target] return X, y ",No,4,21.0 "# Convert the features like features, photos, description into numeric by computing their length
# Generate hash for the building_id, manager_id
train_df[""num_photos""] = train_df[""photos""].apply(len)
train_df[""num_features""] = train_df[""features""].apply(len)
train_df[""num_description_words""] = train_df[""description""].apply(lambda x: len(x.split("" "")))
train_df[""building_gen_id""] = train_df[""building_id""].apply(lambda x: x.encode('utf-8'))
train_df[""building_gen_id""] = train_df[""building_gen_id""].apply(lambda x: string2numeric_hash(x))
train_df[""manager_gen_id""] = train_df[""manager_id""].apply(lambda x: x.encode('utf-8'))
train_df[""manager_gen_id""] = train_df[""manager_gen_id""].apply(lambda x: string2numeric_hash(x))


test_df[""num_photos""] = test_df[""photos""].apply(len)
test_df[""num_features""] = test_df[""features""].apply(len)
test_df[""num_description_words""] = test_df[""description""].apply(lambda x: len(x.split("" "")))
test_df[""building_gen_id""] = test_df[""building_id""].apply(lambda x: x.encode('utf-8'))
test_df[""building_gen_id""] = test_df[""building_gen_id""].apply(lambda x: string2numeric_hash(x))
test_df[""manager_gen_id""] = test_df[""manager_id""].apply(lambda x: x.encode('utf-8'))
test_df[""manager_gen_id""] = test_df[""manager_gen_id""].apply(lambda x: string2numeric_hash(x))
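# Hashing the id strings gives a quick numeric encoding, but distinct ids can
# collide; a frequency or target encoding of manager_id / building_id is
# usually a stronger signal.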
",No,5,8.0 "# Select the features and prepare the Input and target variables selected_features = [""bedrooms"", ""bathrooms"", ""price"", ""num_photos"", ""num_features"", ""num_description_words"", ""building_gen_id"", ""manager_gen_id""] X = train_df[selected_features] Y = train_df[""interest_level""] X.head() ",Yes,4,21.0 "#Split the input into training and validation sets from sklearn.model_selection import train_test_split X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.33) #Pass the input to the algo and calculate the loss from sklearn.ensemble import RandomForestClassifier algo = RandomForestClassifier(n_estimators=100) algo.fit(X_train, Y_train) y_predict_val = algo.predict_proba(X_val) from sklearn.metrics import log_loss log_loss(Y_val, y_predict_val) ",Yes,3,13.0 "X_test = test_df[selected_features] y_predict_test = algo.predict_proba(X_test) ",No,5,48.0 "sub = pd.DataFrame() sub[""listing_id""] = test_df[""listing_id""] for label in [""high"", ""medium"", ""low""]: sub[label] = y_predict_test[:, labels2idx[label]] sub.to_csv(""categoral_numeric.csv"", index=False) print(""process is done1"") ",No,5,25.0 "import numpy as np import pandas as pd from sklearn.feature_extraction.text import CountVectorizer from sklearn.preprocessing import LabelEncoder from sklearn.model_selection import StratifiedKFold import random from math import exp import xgboost as xgb from sklearn.ensemble import RandomForestClassifier from sklearn.model_selection import train_test_split from sklearn.metrics import log_loss from collections import defaultdict, Counter random.seed(4321) np.random.seed(4321) train_df = pd.read_json(""../input/train.json"") test_df = pd.read_json(""../input/test.json"") train_test = pd.concat([train_df, test_df], 0) ",No,4,44.0 "print(train_df.shape) print(test_df.shape) ",No,5,58.0 "train_df.head(5) ",No,5,41.0 "df = pd.read_json(open(""../input/train.json"", ""r"")) df[""num_photos""] = df[""photos""].apply(len) df[""num_features""] = df[""features""].apply(len) df[""num_description_words""] = df[""description""].apply(lambda x: len(x.split("" ""))) df[""created""] = pd.to_datetime(df[""created""]) df[""created_year""] = df[""created""].dt.year df[""created_month""] = df[""created""].dt.month df[""created_day""] = df[""created""].dt.day ",Yes,4,8.0 "num_feats = [""bathrooms"", ""bedrooms"", ""latitude"", ""longitude"", ""price"", ""num_photos"", ""num_features"", ""num_description_words"", ""created_year"", ""created_month"", ""created_day""] X = df[num_feats] y = df[""interest_level""] X.head() ",Yes,4,21.0 "X_train.loc[10000,""photos""] ",No,5,14.0 "X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.33) clf = RandomForestClassifier(n_estimators=1000) clf.fit(X_train, y_train) y_val_pred = clf.predict_proba(X_val) log_loss(y_val, y_val_pred) ",Yes,3,13.0 "df = pd.read_json(open(""../input/test.json"", ""r"")) print(df.shape) df[""num_photos""] = df[""photos""].apply(len) df[""num_features""] = df[""features""].apply(len) df[""num_description_words""] = df[""description""].apply(lambda x: len(x.split("" ""))) df[""created""] = pd.to_datetime(df[""created""]) df[""created_year""] = df[""created""].dt.year df[""created_month""] = df[""created""].dt.month df[""created_day""] = df[""created""].dt.day X = df[num_feats] y = clf.predict_proba(X) ",Yes,4,8.0 "sub = pd.DataFrame() sub[""listing_id""] = df[""listing_id""] for label in [""high"", ""medium"", ""low""]: sub[label] = y[:, labels2idx[label]] sub.to_csv(""submission_rf.csv"", index=False) 
",No,5,25.0 "%matplotlib inline
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
import time as time
from sklearn.preprocessing import StandardScaler, Imputer, LabelBinarizer
from sklearn.metrics import log_loss
from sklearn.neural_network import MLPClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.cluster import KMeans
from sklearn_pandas import DataFrameMapper
from sklearn.pipeline import make_pipeline

def get_skf_indexes(df, target, kfold=4):
X = df.values
y = df[target].values
skf = StratifiedKFold(n_splits=kfold)
skf.get_n_splits(X, y)
indexes = [[],[]]
for train_index, test_index in skf.split(X, y):
indexes[0].append(train_index)
indexes[1].append(test_index)
return indexes
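# Example usage (assuming df_train holds the raw training frame and
# 'interest_level' is the target column; the names are illustrative):
#   train_folds, valid_folds = get_skf_indexes(df_train, 'interest_level')
#   idx_tr, idx_va = train_folds[0], valid_folds[0]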


def output_results(clf, x_test, listing, fname):
preds = clf.predict_proba(x_test)
preds = pd.DataFrame(preds)
cols = ['low', 'medium', 'high']
preds.columns = cols
preds['listing_id'] = listing
preds.to_csv(fname, index=None)
print(preds[cols].mean().values)


def basic_preprocess(df_train, df_test, n_min=50, precision=3):

# Interest: Numerical encoding of interest level
df_train[\'y\'] = 0.0
df_train.loc[df_train.interest_level==\'medium\', \'y\'] = 1.0
df_train.loc[df_train.interest_level==\'high\', \'y\'] = 2.0

# Location features: Latitude, longitude
df_train[\'num_latitude\'] = df_train.latitude.values
df_test[\'num_latitude\'] = df_test.latitude.values
df_train[\'num_longitude\'] = df_train.longitude.values
df_test[\'num_longitude\'] = df_test.longitude.values
x = np.sqrt(((df_train.latitude - df_train.latitude.median())**2) + (df_train.longitude - df_train.longitude.median())**2)
df_train[\'num_dist_from_center\'] = x.values
x = np.sqrt(((df_test.latitude - df_train.latitude.median())**2) + (df_test.longitude - df_train.longitude.median())**2)
df_test[\'num_dist_from_center\'] = x.values
df_train[\'pos\'] = df_train.longitude.round(precision).astype(str) + \'_\' + df_train.latitude.round(precision).astype(str)
df_test[\'pos\'] = df_test.longitude.round(precision).astype(str) + \'_\' + df_test.latitude.round(precision).astype(str)
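# Rounding latitude/longitude to 3 decimals groups listings into cells of
# roughly 100 m; these 'pos' keys feed the density and mean-interest features
# computed below.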

# Degree of ""outlierness""
OutlierAggregated = (df_train.bedrooms > 4).astype(float)
OutlierAggregated2 = (df_test.bedrooms > 4).astype(float)
OutlierAggregated += (df_train.bathrooms > 3).astype(float)
OutlierAggregated2 += (df_test.bathrooms > 3).astype(float)
OutlierAggregated += (df_train.bathrooms < 1).astype(float)
OutlierAggregated2 += (df_test.bathrooms < 1).astype(float)
x = np.abs((df_train.price - df_train.price.median())/df_train.price.std()) > 0.30
OutlierAggregated += x.astype(float)
x2 = np.abs((df_test.price - df_train.price.median())/df_train.price.std()) > 0.30
OutlierAggregated2 += x2.astype(float)
x = np.log1p(df_train.price/(df_train.bedrooms.clip(1,3) + df_train.bathrooms.clip(1,2))) > 8.2
OutlierAggregated += x.astype(float)
x2 = np.log1p(df_test.price/(df_test.bedrooms.clip(1,3) + df_test.bathrooms.clip(1,2))) > 8.2
OutlierAggregated2 += x2.astype(float)
x = np.sqrt(((df_train.latitude - df_train.latitude.median())**2) + (df_train.longitude - df_train.longitude.median())**2) > 0.30
OutlierAggregated += x.astype(float)
x2 = np.sqrt(((df_test.latitude - df_train.latitude.median())**2) + (df_test.longitude - df_train.longitude.median())**2) > 0.30
OutlierAggregated2 += x2.astype(float)
df_train[\'num_OutlierAggregated\'] = OutlierAggregated.values
df_test[\'num_OutlierAggregated\'] = OutlierAggregated2.values
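# num_OutlierAggregated simply counts how many of the heuristics above flag the
# listing, so larger values mean a listing that is unusual on several axes.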

# Average interest in unique locations at given precision
x = df_train.groupby(\'pos\')[\'y\'].aggregate([\'count\', \'mean\'])
d = x.loc[x[\'count\'] >= n_min, \'mean\'].to_dict()
impute = df_train.y.mean()
df_train[\'num_pos\'] = df_train.pos.apply(lambda x: d.get(x, impute))
df_test[\'num_pos\'] = df_test.pos.apply(lambda x: d.get(x, impute))
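# This is a target (mean) encoding computed on the full training set; the n_min
# count threshold damps noisy cells, but a strictly leak-free variant would be
# computed out-of-fold.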

# Density in unique locations at given precision
vals = df_train[\'pos\'].value_counts()
dvals = vals.to_dict()
df_train[\'num_pos_density\'] = df_train[\'pos\'].apply(lambda x: dvals.get(x, vals.min()))
df_test[\'num_pos_density\'] = df_test[\'pos\'].apply(lambda x: dvals.get(x, vals.min()))

# Building null
df_train[\'num_building_null\'] = (df_train.building_id==\'0\').astype(float)
df_test[\'num_building_null\'] = (df_test.building_id==\'0\').astype(float)

# Building supervised
x = df_train.groupby(\'building_id\')[\'y\'].aggregate([\'count\', \'mean\'])
d = x.loc[x[\'count\'] >= n_min, \'mean\'].to_dict()
impute = df_train.y.mean()
df_train[\'num_building_id\'] = df_train.building_id.apply(lambda x: d.get(x, impute))
df_test[\'num_building_id\'] = df_test.building_id.apply(lambda x: d.get(x, impute))

# Building frequency
d = np.log1p(df_train.building_id.value_counts()).to_dict()
impute = np.min(np.array(list(d.values())))
df_train[\'num_fbuilding\'] = df_train.building_id.apply(lambda x: d.get(x, impute))
df_test[\'num_fbuilding\'] = df_test.building_id.apply(lambda x: d.get(x, impute))

# Manager supervised
x = df_train.groupby(\'manager_id\')[\'y\'].aggregate([\'count\', \'mean\'])
d = x.loc[x[\'count\'] >= n_min, \'mean\'].to_dict()
impute = df_train.y.mean()
df_train[\'num_manager\'] = df_train.manager_id.apply(lambda x: d.get(x, impute))
df_test[\'num_manager\'] = df_test.manager_id.apply(lambda x: d.get(x, impute))

# Manager frequency
d = np.log1p(df_train.manager_id.value_counts()).to_dict()
impute = np.min(np.array(list(d.values())))
df_train[\'num_fmanager\'] = df_train.manager_id.apply(lambda x: d.get(x, impute))
df_test[\'num_fmanager\'] = df_test.manager_id.apply(lambda x: d.get(x, impute))

# Creation time features
df_train[\'created\'] = pd.to_datetime(df_train.created)
df_train[\'num_created_weekday\'] = df_train.created.dt.dayofweek.astype(float)
df_train[\'num_created_weekofyear\'] = df_train.created.dt.weekofyear
df_test[\'created\'] = pd.to_datetime(df_test.created)
df_test[\'num_created_weekday\'] = df_test.created.dt.dayofweek
df_test[\'num_created_weekofyear\'] = df_test.created.dt.weekofyear

# Bedrooms/Bathrooms/Price
df_train[\'num_bathrooms\'] = df_train.bathrooms.clip_upper(4)
df_test[\'num_bathrooms\'] = df_test.bathrooms.clip_upper(4)
df_train[\'num_bedrooms\'] = df_train.bedrooms.clip_upper(5)
df_test[\'num_bedrooms\'] = df_test.bedrooms.clip_upper(5)
df_train[\'num_price\'] = df_train.price.clip_upper(10000)
df_test[\'num_price\'] = df_test.price.clip_upper(10000)
bins = df_train.price.quantile(np.arange(0.05, 1, 0.05))
df_train[\'num_price_q\'] = np.digitize(df_train.price, bins)
df_test[\'num_price_q\'] = np.digitize(df_test.price, bins)
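# np.digitize maps each price onto the index of the training-set quantile bin
# it falls into (bins are the 5%..95% quantiles), giving an ordinal price band.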

# Composite features based on:
# https://www.kaggle.com/arnaldcat/two-sigma-connect-rental-listing-inquiries/a-proxy-for-sqft-and-the-interest-on-1-2-baths
df_train[\'num_priceXroom\'] = (df_train.price / (1 + df_train.bedrooms.clip(1, 4) + 0.5*df_train.bathrooms.clip(0, 2))).values
df_test[\'num_priceXroom\'] = (df_test.price / (1 + df_test.bedrooms.clip(1, 4) + 0.5*df_test.bathrooms.clip(0, 2))).values
df_train[\'num_even_bathrooms\'] = ((np.round(df_train.bathrooms) - df_train.bathrooms)==0).astype(float)
df_test[\'num_even_bathrooms\'] = ((np.round(df_test.bathrooms) - df_test.bathrooms)==0).astype(float)

# Other features
df_train[\'num_features\'] = df_train.features.apply(lambda x: len(x))
df_test[\'num_features\'] = df_test.features.apply(lambda x: len(x))
df_train[\'num_photos\'] = df_train.photos.apply(lambda x: len(x))
df_test[\'num_photos\'] = df_test.photos.apply(lambda x: len(x))
df_train[\'num_desc_length\'] = df_train.description.str.split(\' \').str.len()
df_test[\'num_desc_length\'] = df_test.description.str.split(\' \').str.len()
df_train[\'num_desc_length_null\'] = (df_train.description.str.len()==0).astype(float)
df_test[\'num_desc_length_null\'] = (df_test.description.str.len()==0).astype(float)

# Features/Description Features
bows = {\'nofee\': [\'no fee\', \'no-fee\', \'no fee\', \'nofee\', \'no_fee\'],
\'lowfee\': [\'reduced_fee\', \'low_fee\',\'reduced fee\', \'low fee\'],
\'furnished\': [\'furnished\'],
\'parquet\': [\'parquet\', \'hardwood\'],
\'concierge\': [\'concierge\', \'doorman\', \'housekeep\',\'in_super\'],
\'prewar\': [\'prewar\', \'pre_war\', \'pre war\', \'pre-war\'],
\'laundry\': [\'laundry\', \'lndry\'],
\'health\': [\'health\', \'gym\', \'fitness\', \'training\'],
\'transport\': [\'train\', \'subway\', \'transport\'],
\'parking\': [\'parking\'],
\'utilities\': [\'utilities\', \'heat water\', \'water included\']
}
for fname, bow in bows.items():
x1 = df_train.description.str.lower().apply(lambda x: np.sum([1 for i in bow if i in x]))
x2 = df_train.features.apply(lambda x: np.sum([1 for i in bow if i in \' \'.join(x).lower()]))
df_train[\'num_\'+fname] = ((x1 + x2) > 0).astype(float).values
x1 = df_test.description.str.lower().apply(lambda x: np.sum([1 for i in bow if i in x]))
x2 = df_test.features.apply(lambda x: np.sum([1 for i in bow if i in \' \'.join(x).lower()]))
df_test[\'num_\'+fname] = ((x1 + x2) > 0).astype(float).values

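# Each num_<name> column built in the loop above is a 0/1 flag recording whether
# any keyword from that bag appears in the listing description or features list.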
return df_train, df_test",No,5,53.0 "def build_model(name): clf = None if name == 'Random Forest': clf = RandomForestClassifier() if name == 'gbm': clf = GradientBoostingClassifier() return clf ",No,5,4.0 "def fit_model(clf, X_train, y_train): return clf.fit(X_train, y_train) ",No,5,7.0 "df = pd.read_json('../input/train.json') df_test = pd.read_json('../input/test.json') df['created'] = pd.to_datetime(df.created) df_test['created'] = pd.to_datetime(df_test.created)",Yes,3,44.0 "def model_and_predict(model_name, data, features): X, y = split_X_y(data, features) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0) clf = build_model(model_name) clf = fit_model(clf, X_train, y_train) y_pred = clf.predict(X_test) accuracy = accuracy_score(y_test, y_pred) print(""The accuracy is {}"".format(accuracy)) y_proba = clf.predict_proba(X_test) log_loss_score = log_loss(y_test, y_proba) print(""The log_loss_score is {}"".format(log_loss_score)) return clf, accuracy, log_loss_score ",Yes,3,48.0 "# Normalize for i in range(x_train.shape[1]): x_test[:, i] = (x_test[:, i] - np.mean(x_train[:, i]))/np.std(x_train[:, i]) x_train[:, i] = (x_train[:, i] - np.mean(x_train[:, i]))/np.std(x_train[:, i])",No,5,18.0 "train_features = ['bathrooms', 'rooms', 'living_rooms', 'building_id', 'price_level', 'months_with_less_listings'] clf, accuracy, log_score = model_and_predict('Random Forest', train_copy, train_features) scores.append(accuracy) log_scores.append(log_score) ",Yes,4,27.0 "test_data[\'rooms\'] = test_data[\'bedrooms\'] + test_data[\'bathrooms\']
test_data[\'living_rooms\'] = test_data[\'bedrooms\'] - test_data[\'bathrooms\']
test_data[\'even_rooms\'] = test_data[\'rooms\'].apply(lambda x : (x%2) == 0)

test_data[\'created\'] = pd.to_datetime(test_data[\'created\'])
test_data[\'year\'] = test_data[\'created\'].dt.year
test_data[\'month\'] = test_data[\'created\'].dt.month
test_data[\'day\'] = test_data[\'created\'].dt.day
test_data[\'hour\'] = test_data[\'created\'].dt.hour

test_data[\'price_per_room\'] = test_data.apply(lambda row: price_per_room(row), axis=1)
test_data[\'price_level\'] = test_data[\'price\'].apply(lambda x: 1 if x<2000 else 0)
test_data[\'building_id\'] = test_data[\'building_id\'].apply(lambda x: 1 if x != ""0"" else 0)

test_data[\'months_with_less_listings\'] = test_data.apply(lambda x: 0 if x[\'month\'] == 31 else 1, axis=1)
",No,5,8.0 "X_test = test_data[train_features] y_proba = clf.predict_proba(X_test) ",No,5,48.0 "import os import sys import operator import numpy as np import pandas as pd from scipy import sparse import xgboost as xgb import random from sklearn import model_selection, preprocessing, ensemble from sklearn.metrics import log_loss from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer #input data train_df=pd.read_json('../input/train.json') test_df=pd.read_json('../input/test.json')",No,5,44.0 "#basic features train_df[""price_t""] =train_df[""price""]/train_df[""bedrooms""] test_df[""price_t""] = test_df[""price""]/test_df[""bedrooms""] train_df[""room_sum""] = train_df[""bedrooms""]+train_df[""bathrooms""] test_df[""room_sum""] = test_df[""bedrooms""]+test_df[""bathrooms""] # count of photos # train_df[""num_photos""] = train_df[""photos""].apply(len) test_df[""num_photos""] = test_df[""photos""].apply(len) # count of ""features"" # train_df[""num_features""] = train_df[""features""].apply(len) test_df[""num_features""] = test_df[""features""].apply(len) # count of words present in description column # train_df[""num_description_words""] = train_df[""description""].apply(lambda x: len(x.split("" ""))) test_df[""num_description_words""] = test_df[""description""].apply(lambda x: len(x.split("" ""))) features_to_use=[""bathrooms"", ""bedrooms"", ""latitude"", ""longitude"", ""price"",""price_t"",""num_photos"", ""num_features"", ""num_description_words"",""listing_id""]",No,5,8.0 " train_X = sparse.hstack([train_df[features_to_use], tr_sparse]).tocsr() test_X = sparse.hstack([test_df[features_to_use], te_sparse]).tocsr() target_num_map = {'high':0, 'medium':1, 'low':2} train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x])) print(train_X.shape, test_X.shape)",Yes,4,21.0 "results_df = pd.DataFrame(results) ",No,5,12.0 "result = pd.concat([results_df, id_df], axis=1) ",No,5,11.0 "train[\'display_address\'] = train[\'display_address\'].apply(lambda x: x.strip("".""))
train[\'display_address\'] = train[\'display_address\'].apply(lambda x: x.lower())
ga = train.groupby([\'display_address\'])[\'display_address\'].count().fillna(0)
ga = pd.DataFrame(ga)
ga.columns = [\'display_count\']
ga[\'display_address\'] = ga.index
ga.loc[ga[\'display_address\'] == \'\',\'display_count\'] = 0
pd.DataFrame(ga)",No,3,8.0 "# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here\'s several helpful packages to load in

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split


# Input data files are available in the ""../input/"" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
print(check_output([""ls"", ""../input""]).decode(""utf8""))

# Any results you write to the current directory are saved as output.",No,5,88.0 "train = pd.read_json('../input/train.json') ### and test if everything OK train.head()",Yes,3,44.0 "train['num_photos'] = train['photos'].apply(len) train['num_features'] = train['features'].apply(len) train['num_description_words'] = train['description'].apply(lambda x: len(x.split(' '))) train['rooms'] = train['bathrooms'] + train['bedrooms'] ulimit = np.percentile(train.price.values, 99) train['price'].loc[train['price']>ulimit] = ulimit train['rooms_per_price'] = train['rooms']/train['price'] train = train[train['bedrooms'] > 0] train['bath_per_beds'] = train['bathrooms']/train['bedrooms'] train.loc[train['bath_per_beds'] > 999999999999,'bath_per_beds'] = 0",No,5,8.0 "X = train[['bathrooms','bedrooms','price','num_photos', 'num_features','num_description_words','rooms','rooms_per_price', 'bath_per_beds','latitude','longitude','building_count','manager_count','display_count']] y = train['labels'] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=7)",Yes,4,21.0 "gbc = GradientBoostingClassifier(loss='deviance', learning_rate=0.05, n_estimators=600, subsample=1.0, criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=5, init=None, random_state=None, max_features=None, verbose=0, max_leaf_nodes=None, warm_start=False, presort='auto') gbc.fit(X,y) out = gbc.predict_proba(X_test)",Yes,4,4.0 "print(log_loss(y_test,out))",No,5,49.0 " test.loc[test['bedrooms'] == 0,'bath_per_beds'] = 0 X = test[['bathrooms','bedrooms','price','num_photos', 'num_features','num_description_words','rooms','room_per_price', 'bath_per_beds','latitude','longitude','building_count','manager_count','display_count']] X = X.fillna(0)",No,5,17.0 out = gbc.predict_proba(X),No,5,48.0 "def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=0, num_rounds=1000):
param = {}
param['objective'] = 'multi:softprob'
param['eta'] = 0.03
param['max_depth'] = 6
param['silent'] = 1
param['num_class'] = 3
param['eval_metric'] = ""mlogloss""
param['min_child_weight'] = 1
param['subsample'] = 0.7
param['colsample_bytree'] = 0.7
param['seed'] = seed_val
num_rounds = num_rounds

plst = list(param.items())
xgtrain = xgb.DMatrix(train_X, label=train_y)

if test_y is not None:
xgtest = xgb.DMatrix(test_X, label=test_y)
watchlist = [ (xgtrain, 'train'), (xgtest, 'test') ]
model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=20)
else:
xgtest = xgb.DMatrix(test_X)
model = xgb.train(plst, xgtrain, num_rounds)

pred_test_y = model.predict(xgtest)
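# Note: when early stopping triggers, the returned model carries best_iteration
# and best_score attributes; older xgboost versions also allow limiting
# predict() to the best trees via ntree_limit.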
return pred_test_y, model",Yes,3,59.0 "result.head(3) ",No,5,41.0 "result.to_csv(""submission.csv"", index=False) ",No,5,25.0 "import numpy as np import pandas as pd ",No,5,22.0 "data_path = ""../input/""
train_file = data_path + ""train.json""
test_file = data_path + ""test.json""
train_df = pd.read_json(train_file)
test_df = pd.read_json(test_file)

train_df = train_df.fillna(\'\')
test_df = test_df.fillna(\'\')

train_df[\'photos_num\'] = train_df.photos.apply(lambda x: len(x))
test_df[\'photos_num\'] = test_df.photos.apply(lambda x: len(x))

train_df[\'features_num\'] = train_df.features.apply(lambda x: len(x))
test_df[\'features_num\'] = test_df.features.apply(lambda x: len(x))

print(\'Shape of train dataset = \' + str(train_df.shape))
print(\'Shape of test dataset = \' + str(test_df.shape))",Yes,3,8.0 "cols = ['bathrooms', 'bedrooms', 'building_id', \\
'description', 'display_address', 'latitude', \\
'longitude', 'manager_id', 'price', 'street_address', \\
'photos_num', 'features_num']
df_merged = pd.merge(train_df, test_df, \\
on=cols, \\
suffixes=('_train', '_test'), how='right')
df_merged = df_merged.rename(columns={'listing_id_test': 'listing_id'})
df_merged.head()",Yes,3,32.0 "fname = 'sample_submission.csv' subm = pd.read_csv(data_path + fname) subm = subm.merge(df_merged[['listing_id','interest_level']], on='listing_id')",Yes,4,45.0 print('Number of duplicates = ' + str(np.sum(subm.duplicated(subset='listing_id')))),No,4,38.0 "subm.sort_values('listing_id').loc[subm.duplicated(subset='listing_id', keep=False)].head(10)",Yes,4,41.0 "print('Number of duplicates in train = ' + \\
str(np.sum(train_df.duplicated(subset=cols, keep=False))))
print('Number of duplicates in test = ' + \\
str(np.sum(test_df.duplicated(subset=cols, keep=False))))",No,5,38.0 "subm.low.loc[subm.interest_level=='low'] = 1.0 subm.medium.loc[subm.interest_level=='low'] = 0.0 subm.high.loc[subm.interest_level=='low'] = 0.0 subm.low.loc[subm.interest_level=='medium'] = 0.0 subm.medium.loc[subm.interest_level=='medium'] = 1.0 subm.high.loc[subm.interest_level=='medium'] = 0.0 subm.low.loc[subm.interest_level=='high'] = 0.0 subm.medium.loc[subm.interest_level=='high'] = 0.0 subm.high.loc[subm.interest_level=='high'] = 1.0 subm = subm.groupby('listing_id').mean() print('subm.shape = ' + str(subm.shape)) subm.head()",Yes,4,14.0 "subm.to_csv('submission.csv', index=True)",No,5,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here\'s several helpful packages to load in

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # for visualization
import seaborn as sns
from sklearn import linear_model
import re

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier, plot_importance

%matplotlib inline
### Seaborn style
sns.set_style(""whitegrid"")

# Input data files are available in the ""../input/"" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

trainingData = pd.read_json(\'../input/train.json\')


#trainingData[\'building_id\'] = trainingData[\'building_id\'].to_string

trainingData.info()",Yes,4,44.0 trainingData.isnull().sum(),No,5,39.0 "sns.countplot(trainingData.interest_level, order=['low', 'medium', 'high']); plt.xlabel('Interest Level'); plt.ylabel('Number of occurrences');",No,5,33.0 " trainingData['numPics'] = trainingData['photos'].apply(len) trainingData.info()",Yes,4,8.0 "features = [x for sublist in trainingData[\'features\'] for x in sublist]

for x in features:
if ""*"" in x: features.remove(x)

features = set(features)

features.discard(\'\')

has_ac = [ s for s in features if any(ac_name in s for ac_name in [\'A/C\', ""AC"", ""Air Conditioning""] ) and not(any(wrong in s for wrong in [""FIRE"",\'ACT\',\'APT\', \'SPACE\',\'YARD\'])) ]

trainingData[\'has_ac\'] = [any(ac in feature for ac in has_ac) for feature in trainingData[\'features\'] ]

free_included = [ s for s in features if any(ac_name in s for ac_name in [""free"",""FREE"",""Free"", ""Gift"", ""gift"", \'1/2 Month fee\', ""included"", ""INCLUDED"",""Included""] ) ]

trainingData[""included_offer""] = [any(free in feature for free in free_included) for feature in trainingData[\'features\']]

doorman = [ s for s in features if any(ac_name in s for ac_name in [""doorman"",""DOORMAN"",""Doorman"",\'doormen\',\'Doormen\', \'full-service\', \'concierge\',\'Concierge\',\'Attended Lobby\', \'Attended lobby\', \'attended lobby\'] ) ]

trainingData[""concierge""] = [any(door in feature for door in doorman) for feature in trainingData[\'features\']]

Washer = [ s for s in features if any(ac_name in s for ac_name in [\'Washer\', ""Dryer"",\'Washer\',\'Dryer\',\'washer\',\'dryer\',\'laundry\',\'LAUNDRY\',\'Laundry\'] ) and not(any(notname in s for notname in [\'dish\',\'DISH\',\'Dish\', \'Disw\'] )) ]

trainingData[""laundry""] = [any(laundry in feature for laundry in Washer) for feature in trainingData[\'features\']]",No,5,8.0 "labelEncoder = LabelEncoder() trainingData['interest'] = labelEncoder.fit_transform(trainingData['interest_level']) trainingDataSub = trainingData.loc[trainingData['interest']==0] trainingDataSub = trainingDataSub.append(trainingData.loc[trainingData['interest']==1].sample(15000)) trainingDataSub = trainingDataSub.append(trainingData.loc[trainingData['interest']==2].sample(10000)) y = trainingDataSub['interest'] X = trainingDataSub[['bathrooms', 'bedrooms', 'latitude', 'longitude', 'price', 'numPics', 'has_ac', 'included_offer', 'concierge', 'laundry']] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=52)",Yes,3,21.0 trainingData.dtypes,No,5,70.0 "from sklearn import neural_network regr = neural_network.MLPClassifier(hidden_layer_sizes = (50,50,10)) regr.fit(X_train, y_train)",Yes,4,4.0 "deepfor = RandomForestClassifier(n_estimators=3, random_state=52) deepfor.fit(X_train, y_train) ",Yes,3,4.0 "testingData = pd.read_json('../input/test.json') testingData['numPics'] = testingData['photos'].apply(len)",Yes,4,44.0 "features = [x for sublist in testingData[\'features\'] for x in sublist]

for x in features:
if ""*"" in x: features.remove(x)

features = set(features)

features.discard(\'\')

has_ac = [ s for s in features if any(ac_name in s for ac_name in [\'A/C\', ""AC"", ""Air Conditioning""] ) and not(any(wrong in s for wrong in [""FIRE"",\'ACT\',\'APT\', \'SPACE\',\'YARD\'])) ]

testingData[\'has_ac\'] = [any(ac in feature for ac in has_ac) for feature in testingData[\'features\'] ]

free_included = [ s for s in features if any(ac_name in s for ac_name in [""free"",""FREE"",""Free"", ""Gift"", ""gift"", \'1/2 Month fee\', ""included"", ""INCLUDED"",""Included""] ) ]

testingData[""included_offer""] = [any(free in feature for free in free_included) for feature in testingData[\'features\']]

doorman = [ s for s in features if any(ac_name in s for ac_name in [""doorman"",""DOORMAN"",""Doorman"",\'doormen\',\'Doormen\', \'full-service\', \'concierge\',\'Concierge\',\'Attended Lobby\', \'Attended lobby\', \'attended lobby\'] ) ]

testingData[""concierge""] = [any(door in feature for door in doorman) for feature in testingData[\'features\']]

Washer = [ s for s in features if any(ac_name in s for ac_name in [\'Washer\', ""Dryer"",\'Washer\',\'Dryer\',\'washer\',\'dryer\',\'laundry\',\'LAUNDRY\',\'Laundry\'] ) and not(any(notname in s for notname in [\'dish\',\'DISH\',\'Dish\', \'Disw\'] )) ]

testingData[""laundry""] = [any(laundry in feature for laundry in Washer) for feature in testingData[\'features\']]",No,4,8.0 "X = testingData[[\'bathrooms\', \'bedrooms\', \'latitude\', \'longitude\', \'price\', \'numPics\', \'has_ac\', \'included_offer\', \'concierge\', \'laundry\']]

predictions = regr.predict_proba(X)

output = pd.DataFrame(testingData[\'listing_id\'], columns = [\'listing_id\'])

output[\'high\'] = predictions[:,0]
output[\'low\'] = predictions[:,1]
output[\'medium\'] = predictions[:,2]


output.to_csv(""submission1.csv"", index=False)",Yes,3,48.0 "# objective is to predict a number of listing enquiries based on features train = pd.read_json(""../input/train.json"", ""r"") test = pd.read_json(""../input/test.json"", ""r"") sample_sub = pd.read_csv(""../input/sample_submission.csv"")",Yes,4,44.0 "sample_sub.head() # the above is what our submission is supposed to look like",No,5,41.0 from sklearn.naive_bayes import GaussianNB,No,5,22.0 gnb = GaussianNB(),No,5,4.0 "train.index = train['listing_id'] train = train.drop('interest_level', 1) model = gnb.fit(train, train_target)",Yes,3,10.0 y = model.predict_proba(test),No,5,48.0 y_dat = pd.DataFrame(y),No,5,12.0 "#y_dat.copy(deep = False) y_dat.loc[:,'listing_id'] = test.index",No,5,8.0 "y_dat.rename(columns = {'0':'medium', '1':'low', '2':'high'}, inplace = True)",No,5,61.0 data.head(),No,5,41.0 "#medium, low, high
#writer = pd.ExcelWriter(\'/Users/reshmasekar/Desktop/sub.xlsx\', engine=\'xlsxwriter\')
# Convert the dataframe to an XlsxWriter Excel object.
data.to_csv(""sub_rf_4.csv"", index = False)
#y_dat.to_excel(""/Users/reshmasekar/Desktop"")",No,5,25.0 train.head(),No,5,41.0 "from sklearn import neural_network #regr = neural_network.MLPClassifier(hidden_layer_sizes = (50,50,10)) #regr.fit(X_train, y_train)",No,5,22.0 "from sklearn.cluster import KMeans regr = RandomForestClassifier(n_estimators=300, random_state=52) regr.fit(X_train, y_train) pred = regr.predict(X_test) print(confusion_matrix(pred, y_test)) print(accuracy_score(pred, y_test)) print(labelEncoder.classes_)",Yes,3,4.0 "import pandas as pd import matplotlib.pyplot as plt import seaborn as sns import numpy as np from sklearn import linear_model from os.path import pardir, curdir, join ",No,5,22.0 "df_train = pd.read_csv(""../input/train.csv"") df_store = pd.read_csv(""../input/store.csv"") df_test = pd.read_csv(""../input/test.csv"") ",No,5,45.0 "df_test['Month'] = df_test['Date'].apply(lambda x: int(x[5:7])) df_train['Month'] = df_train['Date'].apply(lambda x: int(x[5:7])) ",No,5,16.0 "df_store['CompetitionDistance'] == df_store['CompetitionDistance'].apply(lambda x: np.log(x)) ",No,5,8.0 "df_test = df_test.fillna(df_test.mean()) ",No,5,17.0 "closed_store_ids = df_test[""Id""][df_test[""Open""] == 0].values df_test = df_test[df_test[""Open""] != 0] ",No,5,14.0 "df_test = df_test.drop(['Date', 'StateHoliday'], axis=1) ",No,5,10.0 "from sklearn.ensemble import RandomForestClassifier from sklearn.model_selection import train_test_split from sklearn.metrics import log_loss ",No,5,22.0 "df = pd.read_json(open(""../input/train.json"", ""r"")) print(df.shape) ",Yes,4,44.0 "df.head() ",No,5,41.0 "print(df.shape) ",No,5,58.0 "df[""num_photos""] = df[""photos""].apply(len) df[""num_features""] = df[""features""].apply(len) df[""num_description_words""] = df[""description""].apply(lambda x: len(x.split("" ""))) df[""created""] = pd.to_datetime(df[""created""]) df[""created_year""] = df[""created""].dt.year df[""created_month""] = df[""created""].dt.month df[""created_day""] = df[""created""].dt.day ",No,5,8.0 "num_feats = [""bathrooms"", ""bedrooms"", ""price""] X = df[num_feats] y = df[""interest_level""] X.head() ",Yes,4,21.0 "X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.33) ",No,5,13.0 "clf = RandomForestClassifier(n_estimators=1000) clf.fit(X_train, y_train) y_val_pred = clf.predict_proba(X_val) log_loss(y_val, y_val_pred) ",Yes,3,4.0 "# Save the test IDs for Kaggle submission test_ids = act_test['activity_id'] def preprocess_acts(data,min_date, train_set=True): # Getting rid of data feature for now dates=data['date'] dates=process_dates(dates,min_date) data = data.drop(['date', 'activity_id'], axis=1) if(train_set): data = data.drop(['outcome'], axis=1) ## Split off _ from people_id data['people_id'] = data['people_id'].apply(lambda x: x.split('_')[1]) data['people_id'] = pd.to_numeric(data['people_id']).astype(int) columns = list(data.columns) # Convert strings to ints for col in columns[1:]: data[col] = data[col].fillna('type 0') data[col] = data[col].apply(lambda x: x.split(' ')[1]) data[col] = pd.to_numeric(data[col]).astype(int) # for column in columns[1:]: # dummies = pd.get_dummies(data[column]) # data[dummies.columns] = dummies data['dates']=dates return data def preprocess_people(data,min_date): dates=data['date'] dates=process_dates(dates,min_date) # TODO refactor this duplication data = data.drop(['date'], axis=1) data['people_id'] = data['people_id'].apply(lambda x: x.split('_')[1]) data['people_id'] = pd.to_numeric(data['people_id']).astype(int) # Values in the people df is Booleans and 
Strings columns = list(data.columns) bools = columns[11:] strings = columns[1:11] for col in bools: data[col] = pd.to_numeric(data[col]).astype(int) for col in strings: data[col] = data[col].fillna('type 0') data[col] = data[col].apply(lambda x: x.split(' ')[1]) data[col] = pd.to_numeric(data[col]).astype(int) #data = data.drop(['group_1'], axis=1) # for column in strings: # dummies = pd.get_dummies(data[column]) # data[dummies.columns] = dummies data['dates']=dates return data",Yes,2,8.0 "#find minimum date min_date=pd.concat([people['date'],act_train['date'],act_test['date']]).min() min_date",No,3,11.0 "# Preprocess each df min_date=pd.concat([people['date'],act_train['date'],act_test['date']]).min() peeps = preprocess_people(people,min_date) actions_train = preprocess_acts(act_train,min_date,train_set=True) actions_test = preprocess_acts(act_test,min_date,train_set=False) print (peeps.columns) print (actions_train.columns) peeps.sample(10)",Yes,4,71.0 actions_train.sample(10),No,5,41.0 "# Merege into a unified table # Training features = actions_train.merge(peeps, how='left', on='people_id') features=features.drop(['people_id'],axis=1) labels = act_train['outcome'] # Testing test = actions_test.merge(peeps, how='left', on='people_id') test=test.drop(['people_id'],axis=1) # Check it out... features.sample(10)",No,4,32.0 "columnss=list(features.columns) columnss #features['group_1'].nunique()",No,5,71.0 "## Split Training Data from sklearn.cross_validation import train_test_split num_test = 0.10 X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=num_test, random_state=23) ## Out of box random forest from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier from sklearn.metrics import accuracy_score, roc_auc_score #from sklearn.grid_search import GridSearchCV #clf=GradientBoostingClassifier() clf = RandomForestClassifier() clf.fit(X_train, y_train)",Yes,4,7.0 "## Training Predictions proba = clf.predict_proba(X_test) preds = proba[:,1] score = roc_auc_score(y_test, preds) print(""Area under ROC {0}"".format(score))",No,4,49.0 "sub = pd.DataFrame() sub[""listing_id""] = df[""listing_id""] for label in [""high"", ""medium"", ""low""]: sub[label] = y[:, labels2idx[label]] sub.to_csv(""submission_ky.csv"", index=False) ",Yes,3,12.0 "df[num_feats] ",No,5,41.0 "# objective is to predict a number of listing enquiries based on features train = pd.read_json(""../input/train.json"", ""r"") test = pd.read_json(""../input/test.json"", ""r"") sample_sub = pd.read_csv(""../input/sample_submission.csv"") ",Yes,4,44.0 "train[""num_photos""] = train[""photos""].apply(len) train[""num_features""] = train[""features""].apply(len) train[""num_description_words""] = train[""description""].apply(lambda x: len(x.split("" ""))) train[""created""] = pd.to_datetime(train[""created""]) train[""created_year""] = train[""created""].dt.year train[""created_month""] = train[""created""].dt.month train[""created_day""] = train[""created""].dt.day ",No,5,8.0 "test[""num_photos""] = test[""photos""].apply(len) test[""num_features""] = test[""features""].apply(len) test[""num_description_words""] = test[""description""].apply(lambda x: len(x.split("" ""))) test[""created""] = pd.to_datetime(test[""created""]) test[""created_year""] = test[""created""].dt.year test[""created_month""] = test[""created""].dt.month test[""created_day""] = test[""created""].dt.day ",No,5,8.0 "from sklearn.linear_model import LogisticRegression ",No,5,22.0 "lr = LogisticRegression() ",No,5,4.0 "y 
= model.predict_proba(test) ",No,5,48.0 "y_dat = pd.DataFrame(y) ",No,5,12.0 "#y_dat.copy(deep = False) y_dat.loc[:,'listing_id'] = test.index ",No,5,8.0 "y_dat.rename(columns = {'0':'medium', '1':'low', '2':'high'}, inplace = True) ",No,5,61.0 "data.head() ",No,5,41.0 "# Convert the dataframe to a CSV data.to_csv(""sub_ky_3.csv"", index = False) ",No,5,25.0 "import matplotlib.pyplot as plt plt.style.use('ggplot') plt.rcParams['figure.figsize'] = (12,8) %matplotlib inline",No,4,22.0 "df_stores = pd.read_csv('../input/store.csv', sep=',') df_data = pd.read_csv('../input/train.csv', sep=',') df_test = pd.read_csv('../input/test.csv', sep=',')",No,5,45.0 "print(df_stores.shape) print(df_data.shape) print(df_test.shape)",No,5,58.0 "#
df_stores = df_stores.drop('CompetitionOpenSinceMonth', axis=1).drop('CompetitionOpenSinceYear', axis=1).drop('Promo2SinceWeek', axis=1).drop('Promo2SinceYear', axis=1)",No,5,10.0 df_stores.head(),No,5,41.0 df_data.head(),No,5,41.0 df_test.head(),No,5,41.0 "# ,
closed_stores = df_test[""Id""][df_test[""Open""] == 0].values
df_test = df_test[df_test[""Open""] != 0]
df_data = df_data[df_data[""Open""] != 0]",No,5,14.0 "# Drop columns that are not needed for prediction
df_data = df_data.drop('Open', axis=1).drop('Customers', axis=1)
df_test = df_test.drop('Open', axis=1)",No,5,10.0 "# Extract month and year from the Date column
df_data['Month'] = df_data['Date'].apply(lambda x: int(x[5:7]))
df_test['Month'] = df_test['Date'].apply(lambda x: int(x[5:7]))
df_data['Year'] = df_data['Date'].apply(lambda x: int(x[:4]))
df_test['Year'] = df_test['Date'].apply(lambda x: int(x[:4]))
df_test = df_test.drop('Date', axis=1)
df_data = df_data.drop('Date', axis=1)",Yes,4,8.0 "# Add the average sales of each store as a new feature,
# i.e. join the per-store mean of Sales onto the store table.
df_stores = df_stores.join(df_data.groupby('Store')['Sales'].mean(), on='Store').rename(columns={'Sales': 'Av_sales'})",Yes,4,61.0 "# Scale the average sales feature to the [0, 1] range
max_s = df_stores['Av_sales'].max()
df_stores['Av_sales']=df_stores['Av_sales']/max_s,No,5,8.0 "df_test = pd.merge(df_test, df_stores, left_index=True, on='Store') df_data = pd.merge(df_data, df_stores, left_index=True, on='Store')",No,5,32.0 "df_data['CompetitionDistance'].fillna(df_data['CompetitionDistance'].median(), inplace = True) df_test['CompetitionDistance'].fillna(df_data['CompetitionDistance'].median(), inplace = True)",No,5,17.0 "max_dist = df_data['CompetitionDistance'].max() df_data['CompetitionDistance']=df_data['CompetitionDistance']/max_dist df_test['CompetitionDistance']=df_test['CompetitionDistance']/max_dist",No,5,8.0 "df_data['PromoInterval'].fillna('n', inplace = True) df_test['PromoInterval'].fillna('n', inplace = True)",No,5,17.0 "# , 0 1, ,
#
print('Days', df_data['DayOfWeek'].unique())
print ('Month', df_data['Month'].unique())
print ('Promo', df_data['Promo'].unique())
print ('StateHoliday', df_data['StateHoliday'].unique())
print ('SchoolHoliday', df_data['SchoolHoliday'].unique())
print ('StoreType', df_data['StoreType'].unique())
print ('Assortment', df_data['Assortment'].unique())
print ('PromoInterval', df_data['PromoInterval'].unique())
print ('Year', df_data['Year'].unique())",No,5,57.0 "df_data[""StateHoliday""].loc[df_data[""StateHoliday""] == 0] = ""0"" df_test[""StateHoliday""].loc[df_test[""StateHoliday""] == 0] = ""0""",No,5,14.0 "print('StateHoliday', df_data['StateHoliday'].unique())",No,5,57.0 #
df_data = df_data[df_data['Sales']!=0],No,5,14.0 df_data = df_data[df_data['Sales']<34500],No,5,14.0 df_data.shape,No,5,58.0 "df_data = df_data.drop('Store', axis=1) df_test = df_test.drop('Store', axis=1)",No,5,10.0 "from sklearn.model_selection import train_test_split df_train, df_mytest = train_test_split(df_data, test_size = 0.2)",No,5,13.0 "X_train, y_train = df_train.drop('Sales', axis=1).values, df_train['Sales'].values",No,5,21.0 "from sklearn.linear_model import LinearRegression model = LinearRegression(n_jobs=4)",No,5,4.0 "model.fit(X_train, y_train)",No,5,7.0 "# Test Set Predictions test_proba = clf.predict_proba(test) test_preds = test_proba[:,1] test_res=clf.predict(test) # Format for submission output = pd.DataFrame({ 'activity_id' : test_ids, 'outcome': test_preds }) output1 = pd.DataFrame({ 'activity_id' : test_ids, 'outcome': test_res }) output.head()",Yes,4,55.0 "output.to_csv('redhat.csv', index = False) output1.to_csv('redhat_noprpba.csv', index = False)",No,5,25.0 "import os import sys import operator import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) # Input data files are available in the ""../input/"" directory. # For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory from subprocess import check_output print(check_output([""ls"", ""../input""]).decode(""utf8"")) # Any results you write to the current directory are saved as output. import xgboost as xgb import random from sklearn import model_selection, preprocessing, ensemble from sklearn.model_selection import train_test_split from sklearn.metrics import log_loss from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer ",Yes,3,8.0 "#input data
train_df=pd.read_json(\'../input/train.json\')
test_df=pd.read_json(\'../input/test.json\')
train_df.head()
#removing outliers

test_df[""bathrooms""].loc[19671] = 1.5
test_df[""bathrooms""].loc[22977] = 2.0
test_df[""bathrooms""].loc[63719] = 2.0

ulimit = np.percentile(train_df.price.values, 99)
train_df[\'price\'].ix[train_df[\'price\']>ulimit] = ulimit

ulimit = np.percentile(test_df.price.values, 99)
test_df[\'price\'].ix[test_df[\'price\']>ulimit] = ulimit
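# Prices are winsorized: everything above the 99th percentile is clipped to that
# percentile. Note that .ix is deprecated in newer pandas; an equivalent,
# illustrative form would be:
# train_df.loc[train_df['price'] > ulimit, 'price'] = ulimit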

train_df[""logprice""] = np.log(train_df[""price""])
test_df[""logprice""] = np.log(test_df[""price""])

# count of ""photos""
train_df[""num_photos""] = train_df[""photos""].apply(len)
test_df[""num_photos""] = test_df[""photos""].apply(len)

train_df[""num_features""] = train_df[""features""].apply(len)
test_df[""num_features""] = test_df[""features""].apply(len)

train_df[""num_description_words""] = train_df[""description""].apply(lambda x: len(x.split("" "")))
test_df[""num_description_words""] = test_df[""description""].apply(lambda x: len(x.split("" "")))



train_df[""pos""] = train_df.longitude.round(3).astype(str) + \'_\' + train_df.latitude.round(3).astype(str)
test_df[""pos""] = test_df.longitude.round(3).astype(str) + \'_\' + test_df.latitude.round(3).astype(str)

vals = train_df[\'pos\'].value_counts()
dvals = vals.to_dict()
train_df[""density""] = train_df[\'pos\'].apply(lambda x: dvals.get(x, vals.min()))
test_df[""density""] = test_df[\'pos\'].apply(lambda x: dvals.get(x, vals.min()))


#basic features
train_df[""price_t""] =train_df[""price""]/train_df[""bedrooms""]
test_df[""price_t""] = test_df[""price""]/test_df[""bedrooms""]
train_df[""room_sum""] = train_df[""bedrooms""]+train_df[""bathrooms""]
test_df[""room_sum""] = test_df[""bedrooms""]+test_df[""bathrooms""]


train_df[\'price_per_room\'] = train_df[\'price\']/train_df[\'room_sum\']
test_df[\'price_per_room\'] = test_df[\'price\']/test_df[\'room_sum\']

image_date = pd.read_csv(\'../input/listing_image_time.csv\')

# rename columns so you can join tables later on
image_date.columns = [""listing_id"", ""time_stamp""]

# fix the single timestamp from April; all other timestamps are from Oct/Nov
image_date.loc[80240,""time_stamp""] = 1478129766

image_date[""img_date""] = pd.to_datetime(image_date[""time_stamp""], unit=""s"")
image_date[""img_days_passed""] = (image_date[""img_date""].max() - image_date[""img_date""]).astype(""timedelta64[D]"").astype(int)
image_date[""img_date_month""] = image_date[""img_date""].dt.month
image_date[""img_date_week""] = image_date[""img_date""].dt.week
image_date[""img_date_day""] = image_date[""img_date""].dt.day
image_date[""img_date_dayofweek""] = image_date[""img_date""].dt.dayofweek
image_date[""img_date_dayofyear""] = image_date[""img_date""].dt.dayofyear
image_date[""img_date_hour""] = image_date[""img_date""].dt.hour
image_date[""img_date_monthBeginMidEnd""] = image_date[""img_date_day""].apply(lambda x: 1 if x<10 else 2 if x<20 else 3)

train_df = pd.merge(train_df, image_date, on=""listing_id"", how=""left"")
test_df = pd.merge(test_df, image_date, on=""listing_id"", how=""left"")

features_to_use=[""bathrooms"", ""bedrooms"", ""price_t"",""room_sum"",""latitude"",""longitude"",""num_photos"",""density"",""logprice"",""num_features"",""num_description_words"",""price_per_room"",""listing_id"",""img_date""]
print(train_df[\'price\'].head())
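# Note: price_t and price_per_room divide by bedrooms / room_sum, which can be
# 0 for studio listings and then yields inf. A guarded variant (illustrative,
# not part of the original cell) would be, e.g.:
# train_df['price_t'] = train_df['price'] / train_df['bedrooms'].clip(lower=1)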
",Yes,3,8.0 "def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=321, num_rounds=818):
    param = {}
    param[\'objective\'] = \'multi:softprob\'
    param[\'eta\'] = 0.03
    param[\'max_depth\'] = 6
    param[\'silent\'] = 1
    param[\'num_class\'] = 3
    param[\'eval_metric\'] = ""mlogloss""
    param[\'min_child_weight\'] = 1
    param[\'subsample\'] = 0.7
    param[\'colsample_bytree\'] = 0.7
    param[\'seed\'] = seed_val
    num_rounds = num_rounds

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [ (xgtrain,\'train\'), (xgtest, \'test\') ]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)

    pred_test_y = model.predict(xgtest)
    return pred_test_y, model
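
# Example usage (illustrative; assumes train_X, train_y and a validation split
# val_X, val_y built the same way as later in this notebook):
# preds, model = runXGB(train_X, train_y, val_X, val_y, num_rounds=818)
# With multi:softprob and num_class=3, preds holds one probability per class
# ('high', 'medium', 'low') for every row of the test matrix.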
",No,3,4.0 "index=list(range(train_df.shape[0]))
random.shuffle(index)
a=[np.nan]*len(train_df)
b=[np.nan]*len(train_df)
c=[np.nan]*len(train_df)

for i in range(5):
    building_level={}
    for j in train_df['manager_id'].values:
        building_level[j]=[0,0,0]
    test_index=index[int((i*train_df.shape[0])/5):int(((i+1)*train_df.shape[0])/5)]
    train_index=list(set(index).difference(test_index))
    for j in train_index:
        temp=train_df.iloc[j]
        if temp['interest_level']=='low':
            building_level[temp['manager_id']][0]+=1
        if temp['interest_level']=='medium':
            building_level[temp['manager_id']][1]+=1
        if temp['interest_level']=='high':
            building_level[temp['manager_id']][2]+=1
    for j in test_index:
        temp=train_df.iloc[j]
        if sum(building_level[temp['manager_id']])!=0:
            a[j]=building_level[temp['manager_id']][0]*1.0/sum(building_level[temp['manager_id']])
            b[j]=building_level[temp['manager_id']][1]*1.0/sum(building_level[temp['manager_id']])
            c[j]=building_level[temp['manager_id']][2]*1.0/sum(building_level[temp['manager_id']])


train_df['manager_level_low']=a
train_df['manager_level_medium']=b
train_df['manager_level_high']=c
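# The loop above is an out-of-fold (5-fold) target encoding of manager_id:
# each listing's manager_level_* shares are computed only from the other four
# folds, which limits leakage of the interest_level target into these features.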
",No,4,20.0 "a=[]
b=[]
c=[]
building_level={}
for j in train_df[\'manager_id\'].values:
    building_level[j]=[0,0,0]
for j in range(train_df.shape[0]):
    temp=train_df.iloc[j]
    if temp[\'interest_level\']==\'low\':
        building_level[temp[\'manager_id\']][0]+=1
    if temp[\'interest_level\']==\'medium\':
        building_level[temp[\'manager_id\']][1]+=1
    if temp[\'interest_level\']==\'high\':
        building_level[temp[\'manager_id\']][2]+=1

for i in test_df[\'manager_id\'].values:
    if i not in building_level.keys():
        a.append(np.nan)
        b.append(np.nan)
        c.append(np.nan)
    else:
        a.append(building_level[i][0]*1.0/sum(building_level[i]))
        b.append(building_level[i][1]*1.0/sum(building_level[i]))
        c.append(building_level[i][2]*1.0/sum(building_level[i]))
test_df[\'manager_level_low\']=a
test_df[\'manager_level_medium\']=b
test_df[\'manager_level_high\']=c


features_to_use.append(\'manager_level_low\')
features_to_use.append(\'manager_level_medium\')
features_to_use.append(\'manager_level_high\')
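# Managers that never appear in the training data keep NaN for the
# manager_level_* features; they are left unimputed here and XGBoost treats
# them as missing values during tree construction.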
categorical = [""display_address"", ""manager_id"", ""building_id"", ""street_address""]
",No,4,20.0 for f in categorical:
    if train_df[f].dtype=='object':
        #print(f)
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(train_df[f].values) + list(test_df[f].values))
        train_df[f] = lbl.transform(list(train_df[f].values))
        test_df[f] = lbl.transform(list(test_df[f].values))
        features_to_use.append(f)
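# The LabelEncoder is fit on the union of train and test values so both frames
# share one consistent integer mapping for each categorical column.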
,No,4,20.0 "train_df[\'features\'] = train_df[""features""].apply(lambda x: "" "".join([""_"".join(i.split("" "")) for i in x]))
test_df[\'features\'] = test_df[""features""].apply(lambda x: "" "".join([""_"".join(i.split("" "")) for i in x]))
",No,5,78.0 "def factorize(df1, df2, column): ps = df1[column].append(df2[column]) factors = ps.factorize()[0] df1[column] = factors[:len(df1)] df2[column] = factors[len(df1):] return df1, df2 ",No,5,20.0 "for col in ('building_id', 'display_address', 'manager_id', 'street_address'): train_df,test_df = factorize(train_df, test_df,col) ",No,5,20.0 "X_test, y_mytest = df_mytest.drop('Sales', axis=1).values, df_mytest['Sales'].values",No,5,21.0 y_hat = model.predict(X_test),No,5,48.0 from sklearn.linear_model import Ridge,No,5,22.0 "R_model = Ridge(alpha=1) R_model.fit(X_train, y_train)",Yes,4,4.0 "import os import sys import operator import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) # Input data files are available in the ""../input/"" directory. # For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory from subprocess import check_output print(check_output([""ls"", ""../input""]).decode(""utf8"")) # Any results you write to the current directory are saved as output. import xgboost as xgb import random from sklearn import model_selection, preprocessing, ensemble from sklearn.model_selection import train_test_split from sklearn.metrics import log_loss from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer",No,5,88.0 "#input data
train_df=pd.read_json(\'../input/train.json\')
test_df=pd.read_json(\'../input/test.json\')
train_df.head()
#removing outliers

test_df[""bathrooms""].loc[19671] = 1.5
test_df[""bathrooms""].loc[22977] = 2.0
test_df[""bathrooms""].loc[63719] = 2.0

ulimit = np.percentile(train_df.price.values, 99)
train_df[\'price\'].ix[train_df[\'price\']>ulimit] = ulimit

ulimit = np.percentile(test_df.price.values, 99)
test_df[\'price\'].ix[test_df[\'price\']>ulimit] = ulimit

train_df[""logprice""] = np.log(train_df[""price""])
test_df[""logprice""] = np.log(test_df[""price""])

# count of ""photos""
train_df[""num_photos""] = train_df[""photos""].apply(len)
test_df[""num_photos""] = test_df[""photos""].apply(len)

train_df[""num_features""] = train_df[""features""].apply(len)
test_df[""num_features""] = test_df[""features""].apply(len)

train_df[""num_description_words""] = train_df[""description""].apply(lambda x: len(x.split("" "")))
test_df[""num_description_words""] = test_df[""description""].apply(lambda x: len(x.split("" "")))



train_df[""pos""] = train_df.longitude.round(3).astype(str) + \'_\' + train_df.latitude.round(3).astype(str)
test_df[""pos""] = test_df.longitude.round(3).astype(str) + \'_\' + test_df.latitude.round(3).astype(str)

vals = train_df[\'pos\'].value_counts()
dvals = vals.to_dict()
train_df[""density""] = train_df[\'pos\'].apply(lambda x: dvals.get(x, vals.min()))
test_df[""density""] = test_df[\'pos\'].apply(lambda x: dvals.get(x, vals.min()))


#basic features
train_df[""price_t""] =train_df[""price""]/train_df[""bedrooms""]
test_df[""price_t""] = test_df[""price""]/test_df[""bedrooms""]
train_df[""room_sum""] = train_df[""bedrooms""]+train_df[""bathrooms""]
test_df[""room_sum""] = test_df[""bedrooms""]+test_df[""bathrooms""]


train_df[\'price_per_room\'] = train_df[\'price\']/train_df[\'room_sum\']
test_df[\'price_per_room\'] = test_df[\'price\']/test_df[\'room_sum\']


features_to_use=[""bathrooms"", ""bedrooms"", ""price_t"",""room_sum"",""latitude"",""longitude"",""num_photos"",""density"",""logprice"",""num_features"",""num_description_words"",""price_per_room"",""listing_id""]
print(train_df[\'price\'].head())",Yes,4,8.0 for f in categorical:
\tif train_df[f].dtype=='object':
\t\t#print(f)
\t\tlbl = preprocessing.LabelEncoder()
\t\tlbl.fit(list(train_df[f].values) + list(test_df[f].values))
\t\ttrain_df[f] = lbl.transform(list(train_df[f].values))
\t\ttest_df[f] = lbl.transform(list(test_df[f].values))
\t\tfeatures_to_use.append(f),No,5,20.0 "train_df[\'features\'] = train_df[""features""].apply(lambda x: "" "".join([""_"".join(i.split("" "")) for i in x]))
test_df[\'features\'] = test_df[""features""].apply(lambda x: "" "".join([""_"".join(i.split("" "")) for i in x]))",No,5,78.0 print(train_df[features_to_use].head()),No,5,41.0 "train_X = sparse.hstack([train_df[features_to_use], tr_sparse]).tocsr()",No,5,11.0 "test_X = sparse.hstack([test_df[features_to_use], te_sparse]).tocsr()",No,5,11.0 "target_num_map = {'high':0, 'medium':1, 'low':2} train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x])) print(train_X.shape, test_X.shape)",Yes,4,21.0 "preds, model = runXGB(train_X, train_y, test_X, num_rounds=1000) out_df = pd.DataFrame(preds) out_df.columns = [""high"", ""medium"", ""low""] out_df[""listing_id""] = test_df.listing_id.values out_df.to_csv(""xgb_starter2.csv"", index=False)",Yes,3,48.0 "data = pd.read_csv('../input/train.csv', sep=',', low_memory=False) data['StateHoliday'] = data['StateHoliday'].apply(lambda x: str(x))",Yes,3,45.0 "grouppedByStoreDayPromo = data[data['Sales'] > 0].groupby(by=['Store', 'DayOfWeek', 'Promo'])",No,5,60.0 "test = pd.read_csv('../input/test.csv', sep=',')",No,5,45.0 "mn = data[data['Sales'] > 0].groupby(['Store', 'DayOfWeek', 'Promo'])['Sales'].mean().reset_index()",No,5,60.0 "res[['Id', 'Sales']].to_csv('result_mean.csv', sep=',', index=None)",No,5,25.0 "import pandas as pd import matplotlib.pyplot as plt import seaborn as sns import numpy as np import matplotlib.dates import datetime %matplotlib inline",No,5,23.0 "df_train = pd.read_csv(""../input/train.csv"") df_store = pd.read_csv(""../input/store.csv"") df_test = pd.read_csv(""../input/test.csv"") df_train.head()",No,5,45.0 "closed_store_ids = df_test[""Id""][df_test[""Open""] == 0].values closed_store_ids",No,4,41.0 "df_train['Year'] = df_train['Date'].apply(lambda x: int(x[:4])) df_train['Month'] = df_train['Date'].apply(lambda x: int(x[5:7])) df_train['Day'] = df_train['Date'].apply(lambda x: int(x[8:]))",No,5,8.0 "fig, (axis1) = plt.subplots(1,1,figsize=(15,4)) sns.countplot(x = 'Open', hue = 'DayOfWeek', data = df_train,)",No,5,33.0 "for temp_year in range (2013,2016):
df_train1_temp = df_train[df_train.Year == temp_year]
average_daily_sales = df_train1_temp.groupby(\'Date\')[""Sales""].mean()
fig = plt.subplots(1,1,sharex=True,figsize=(25,8))
average_daily_sales.plot(title=""Average Daily Sales"")",No,5,33.0 "average_monthly_sales = df_train.groupby(\'Month\')[""Sales""].mean()
fig = plt.subplots(1,1,sharex=True,figsize=(10,5))
average_monthly_sales.plot(legend=True,marker=\'o\',title=""Average Sales"")",No,5,33.0 df_train.StateHoliday.unique(),No,5,57.0 "df_train['StateHoliday'] = df_train['StateHoliday'].replace(0, '0') df_train.StateHoliday.unique()",Yes,3,16.0 "sns.factorplot(x =""Year"", y =""Sales"", hue =""Promo"", data = df_train, size = 5, kind =""box"", palette =""muted"") sns.factorplot(x =""Year"", y =""Sales"", hue =""SchoolHoliday"", data = df_train, size = 5, kind =""box"", palette =""muted"") sns.factorplot(x =""Year"", y =""Sales"", hue =""HolidayBin"", data = df_train, size = 5, kind =""box"", palette =""muted"")",No,5,75.0 "sns.factorplot(x =""Year"", y =""Sales"", hue =""StateHoliday"", data = df_train, size = 6, kind =""bar"", palette =""muted"")",No,5,75.0 "sns.factorplot(x =""Month"", y =""Sales"", hue =""HolidayBin"", data = df_train, size = 6, kind =""bar"", palette =""muted"")",No,5,75.0 "sns.factorplot(x=""DayOfWeek"", y=""Customers"", hue=""HolidayBin"", col=""Promo"", data=df_train, capsize=.2, palette=""YlGnBu_d"", size=6, aspect=.75)",No,5,75.0 "sns.factorplot(x=""DayOfWeek"", y=""Customers"", hue=""SchoolHoliday"", col=""Promo"", data=df_train, capsize=.2, palette=""YlGnBu_d"", size=6, aspect=.75)",No,5,75.0 sns.distplot(df_train.Sales),No,5,33.0 df_store.head(),No,5,41.0 "total_sales_customers = df_train.groupby('Store')['Sales', 'Customers'].sum() total_sales_customers.head()",Yes,3,60.0 "df_total_sales_customers = pd.DataFrame({'Sales': total_sales_customers['Sales'], 'Customers': total_sales_customers['Customers']}, index = total_sales_customers.index) df_total_sales_customers = df_total_sales_customers.reset_index() df_total_sales_customers.head()",Yes,3,12.0 "avg_sales_customers = df_train.groupby('Store')['Sales', 'Customers'].mean() avg_sales_customers.head()",Yes,4,60.0 "df_avg_sales_customers = pd.DataFrame({'Sales': avg_sales_customers['Sales'], 'Customers': avg_sales_customers['Customers']}, index = avg_sales_customers.index) df_avg_sales_customers = df_avg_sales_customers.reset_index() df_stores_avg = df_avg_sales_customers.join(df_store.set_index('Store'), on='Store') df_stores_avg.head()",Yes,3,12.0 "df_stores_new = df_total_sales_customers.join(df_store.set_index('Store'), on='Store') df_stores_new.head()",Yes,4,32.0 "average_storetype = df_stores_new.groupby('StoreType')['Sales', 'Customers', 'CompetitionDistance'].mean() fig, (axis1,axis2,axis3) = plt.subplots(1,3,figsize=(15,4)) sns.barplot(average_storetype.index, average_storetype['Sales'], ax=axis1) sns.barplot(average_storetype.index, average_storetype['Customers'], ax=axis2) sns.barplot(average_storetype.index, average_storetype['CompetitionDistance'], ax=axis3) average_storetype.index",Yes,4,33.0 "average_assortment = df_stores_new.groupby('Assortment')['Sales', 'Customers'].mean() fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,4)) sns.barplot(average_assortment.index, average_assortment['Sales'], ax=axis1) sns.barplot(average_assortment.index, average_assortment['Customers'], ax=axis2)",Yes,4,33.0 "df_test[\'Year\'] = df_test[\'Date\'].apply(lambda x: int(x[:4]))
df_test[\'Month\'] = df_test[\'Date\'].apply(lambda x: int(x[5:7]))
df_test[\'Day\'] = df_test[\'Date\'].apply(lambda x: int(x[8:]))
df_test[""HolidayBin""] = df_test.StateHoliday.map({""0"": 0, ""a"": 1, ""b"": 1, ""c"": 1})
del df_test[\'Date\']
del df_test[\'StateHoliday\']
df_test.head()",Yes,3,16.0 df_train.head(),No,5,41.0 "from sklearn.linear_model import LinearRegression from sklearn.ensemble import RandomForestRegressor from sklearn.model_selection import GridSearchCV train_stores = dict(list(df_train.groupby('Store'))) test_stores = dict(list(df_test.groupby('Store')))",Yes,4,22.0 "best_list_max_depth = []
best_list_n_estimators = []

for i in test_stores:
store = train_stores[i]
X_train = store.drop([""Sales"", ""Store"", ""Customers""],axis=1)
Y_train = store[""Sales""]
X_test = test_stores[i].copy()


store_ids = X_test[""Id""]
X_test.drop([""Id"",""Store""], axis=1,inplace=True)
X_train = X_train.fillna(X_train.mean())
X_test = X_test.fillna(X_train.mean())

estimator = RandomForestRegressor(random_state=123, criterion = \'mse\')
params = {\'n_estimators\': range(5, 20), \'max_depth\': range(5, 25)}
grid = GridSearchCV(estimator, params).fit(X_train, Y_train)
best_list_max_depth.append(grid.best_params_[\'max_depth\'])
best_list_n_estimators.append(grid.best_params_[\'n_estimators\'])
print (""score"", grid.best_score_)
print (""params"", grid.best_params_)",Yes,4,2.0 "res_max_depth = round(np.array(best_list_max_depth).mean()) res_n_estimators = round(np.array(best_list_n_estimators).mean())",No,5,2.0 "best_max_depth = round(np.array(best_list_max_depth).mean()) best_n_estimators = round(np.array(best_list_n_estimators).mean()) print(best_max_depth) print(best_n_estimators)",No,5,2.0 "import time import datetime import matplotlib.pyplot as plt %matplotlib inline ",No,5,23.0 "trainData = pd.read_csv('../input/train.csv', low_memory=False) trainData ",No,5,45.0 "supply = pd.read_csv('../input/store.csv') supply.set_index('Store', inplace=True) supply ",No,5,45.0 "trainData ",No,5,41.0 "#: 2 . customers, sales
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import validation_curve
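# Two-stage approach used below: one random forest predicts Customers from the
# base features, and a second random forest predicts Sales with Customers
# appended as an extra feature (actual values at train time, predicted values
# at test time).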
",No,5,22.0 "X_train, customers_train, sales_train = trainData.drop(['Sales', 'Customers'], axis=1).values, trainData['Customers'].values, trainData['Sales'].values ",No,5,21.0 "X1_train = np.concatenate((X_train, np.reshape(customers_train, (customers_train.shape[0], 1))), axis=1) ",No,5,11.0 "forest1 = RandomForestRegressor() forest2 = RandomForestRegressor() forest1.fit(X_train, customers_train) forest2.fit(X1_train, sales_train) ",Yes,3,7.0 #
testData = pd.read_csv('../input/test.csv')
,No,5,45.0 "testData['UnixTime'] = testData['Date'].map(toUnixTime) testData.drop('Date', axis=1, inplace=True) ",Yes,3,8.0 "testData.StateHoliday.replace({'a': 1, 'b': 2, 'c': 3, '0': 0}, inplace=True) unixTime = testData['UnixTime'].values testData['UnixTime'] = (unixTime - np.mean(unixTime))/np.std(unixTime) ",No,4,18.0 "testData ",No,5,41.0 "nanToMedian(testData) X_test = testData.drop('Id', axis=1).values ",Yes,3,10.0 "customers_test = forest1.predict(X_test) X1_test = np.concatenate((X_test, np.reshape(customers_test, (customers_test.shape[0], 1))), axis=1) sales_predicted = forest2.predict(X1_test) ",Yes,3,32.0 "Result = pd.DataFrame(testData.Id) Result['Sales'] = sales_predicted Result.set_index('Id') Result.to_csv('./result.csv', index=False) ",No,5,25.0 "import numpy as np import pandas as pd import matplotlib.pyplot as plt %matplotlib inline",No,5,23.0 "df_store = pd.read_csv('../input/store.csv', index_col=None)",No,5,45.0 df_store.info(),No,5,40.0 "df_train = pd.read_csv('../input/train.csv', index_col=None, low_memory=False)",No,5,45.0 "y_hat_R = R_model.predict(X_test) print(eval_rmspe(y_hat_R, y_mytest))",Yes,4,48.0 "print(df_train.shape) df_train.head()",Yes,3,58.0 "#,
from sklearn.ensemble import RandomForestRegressor",No,5,22.0 df_train.info(),No,5,40.0 "df_train.replace({'StateHoliday': {0: '0'}}, inplace=True) df_train.StateHoliday.unique()",Yes,3,57.0 df_train.DayOfWeek = df_train.DayOfWeek.astype(str) # for dictvectorizer,No,5,16.0 "#
X_train_full, y_train_full = df_data.drop('Sales', axis=1).values, df_data['Sales'].values",No,5,21.0 "X_test_full = df_test.drop('Id', axis=1).values",Yes,4,10.0 "# ,
forest = RandomForestRegressor(n_jobs=1, n_estimators=150, max_features=7, max_depth=100)
forest.fit(X_train_full, y_train_full)",Yes,4,4.0 y_hat_full = forest.predict(X_test_full),No,5,48.0 "submission = submission.append(pd.Series(y_hat_full, index=store_ids))
submission = pd.DataFrame({ ""Id"": submission.index, ""Sales"": submission.values})
submission.to_csv(\'submission.csv\', index=False)",Yes,3,12.0 "df = df_train[df_train.Open != 0].merge(df_store, on='Store').fillna(1) df.drop(['Store', 'Date', 'Customers'], axis=1, inplace=True)",Yes,3,17.0 "df.shape, df.columns",Yes,3,58.0 y_train = df.Sales.values,No,5,21.0 "print(X_train.shape, y_train.shape)",No,5,58.0 "from sklearn.ensemble import RandomForestRegressor rgr = RandomForestRegressor(n_estimators=25, verbose=True, n_jobs=8) rgr.fit(X_train, y_train) print(rgr.score(X_train, y_train))",Yes,3,4.0 "df_test = pd.read_csv('../input/test.csv', index_col=None)",No,5,45.0 "print(df_test.shape) df_test.head()",Yes,3,58.0 "features_to_use = [""bathrooms"", ""bedrooms"", ""latitude"", ""longitude"", ""price""]",No,5,77.0 "df_test.replace({'StateHoliday': {0: '0'}}, inplace=True) df_test.StateHoliday.unique()",Yes,3,57.0 "df_pred = df_test[df_test.Open != 0].merge(df_store, on='Store').fillna(1) df_pred.drop(['Id', 'Store', 'Date'], axis=1, inplace=True)",Yes,3,17.0 df_pred.shape,No,5,58.0 X_test.shape,No,5,58.0 rgr.predict(X_test)[:10],No,5,48.0 "df_test.loc[df_test.Open != 0,'Sales'] = rgr.predict(X_test) df_test.loc[df_test.Open == 0, 'Sales'] = 0",No,5,48.0 "out = pd.DataFrame({
""Id"": df_test.Id,
""Sales"": df_test.Sales.values
})
out.to_csv(\'submission.csv\', index=False)",No,5,25.0 "from mlbox.preprocessing import * from mlbox.optimisation import * from mlbox.prediction import *",No,5,22.0 "paths = [""../input/train.json"", ""../input/test.json""] target_name = ""interest_level""",No,5,77.0 "rd = Reader() df = rd.train_test_split(paths, target_name)",No,5,13.0 "df[""train""].head()",No,5,41.0 "# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here\'s several helpful packages to load in

import os

import h2o
from h2o.estimators.random_forest import H2ORandomForestEstimator

import numpy as np
import pandas as pd

os.environ[\'http_proxy\'] = \'\'
os.environ[\'https_proxy\'] = \'\'
os.environ[\'NO_PROXY\'] = \'localhost\'

def transform(df_):
""""""
transform Date to datetime type.
""""""
df_[\'Date\']=pd.to_datetime(df_[\'Date\'])
df_[\'month\']=df_[\'Date\'].dt.month
df_[\'year\']=df_[\'Date\'].dt.year
df_[\'Store\']=pd.to_numeric(df_[\'Store\'])
return df_

store=pd.read_csv(""../input/store.csv"")
train=pd.read_csv(""../input/train.csv"")
test=pd.read_csv(""../input/test.csv"")
# keep only rows with positive Sales
train=train[train[\'Sales\'] > 0]
# merge train and test with store
train=pd.merge(train, store, on=[\'Store\'])
test=pd.merge(test, store, on=[\'Store\'])
# transform
train=transform(train)
test=transform(test)
# add a logSales column to train, defined as log(Sales)
train[\'logSales\']=pd.to_numeric(np.log(train[\'Sales\']))
test[\'logSales\']=0
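# The forest is trained on logSales = log(Sales); predictions are mapped back to
# the sales scale with np.exp further down in this script.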

# initialization of h2o
h2o.init(nthreads=-1, max_mem_size = ""8G"")
train_hf = h2o.H2OFrame(train)
test_hf = h2o.H2OFrame(test)
rf_v1_model = H2ORandomForestEstimator(model_id=""rf_covType_v1"", ntrees=200, stopping_rounds=2, max_depth = 30, nbins_cats = 1115, score_each_iteration=True, seed=1000000)

# Prepare predictor/response columns and train the model
covtype_X=[col for col in train_hf.columns if col not in [""Id"",""Date"",""Sales"",""logSales"",""Customers""]]
covtype_y=train_hf.columns[-1]
rf_v1_model.train(x=covtype_X, y=covtype_y, training_frame=train_hf)
test_result_hf = rf_v1_model.predict(test_hf)
test_result_df = test_result_hf.as_data_frame()
test_result_df[\'predict\']=np.exp(test_result_df[\'predict\'])
test_result_df.rename(columns={\'predict\': \'Sales\'}, inplace=True)
test_result_df.insert(loc=0, column=\'Id\', value=test[\'Id\'])
test_result_df.set_index(\'Id\')
test_result_df.to_csv(\'python_h2o_rf.csv\', header=True, index=False)",Yes,4,22.0 "import numpy as np import pandas as pd",No,5,22.0 "brats = pd.read_csv(""../input/child_wishlist_v2.csv"", header=None, index_col=0).as_matrix() gifts = pd.read_csv(""../input/gift_goodkids_v2.csv"", header=None, index_col=0).as_matrix()",No,5,45.0 "TRIPS_COUNT = 5_001 TWINS_COUNT = 40_000 TWINS_START = TRIPS_COUNT TWINS_END = TWINS_START + TWINS_COUNT GIFTS_LIMIT = 1_000 BRAT_PREF_COUNT = brats.shape[1] GIFT_PREF_COUNT = gifts.shape[1]",No,5,77.0 "submit = pd.read_csv(""save/""+target_name+""_predictions.csv"")[[""high"", ""medium"", ""low""]] submit[""listing_id""] = df[""test""].listing_id.astype(int).values submit.to_csv(""mlbox.csv"", index=False)",Yes,3,45.0 "print(""Quantity of Gifts:"") pd.Series(gift_cnt).value_counts()",No,5,72.0 "import os, operator, math import pandas as pd import numpy as np import datetime as dt from tqdm import tqdm import matplotlib.pyplot as plt from scipy.optimize import linear_sum_assignment from collections import defaultdict, Counter",Yes,3,22.0 "scores = pd.DataFrame(rows) scores.head()",Yes,4,12.0 "submission = pd.DataFrame({ ""ChildId"": range(len(brats)), ""GiftId"": assigned })",No,5,12.0 "child_data = pd.read_csv('../input/santa-gift-matching/child_wishlist_v2.csv', header=None).drop(0, 1).values gift_data = pd.read_csv('../input/santa-gift-matching/gift_goodkids_v2.csv', header=None).drop(0, 1).values n_children = 1000000 n_gift_type = 1000 n_gift_quantity = 1000 n_child_wish = 100 triplets = 5001 twins = 40000 tts = triplets + twins ",Yes,3,77.0 "submission.to_csv(""greedy_v2.csv"", index=False)",No,5,25.0 "import os, cv2, re, random import numpy as np import pandas as pd from keras.preprocessing.image import ImageDataGenerator from keras.preprocessing.image import img_to_array, load_img from keras import layers, models, optimizers from keras import backend as K from sklearn.model_selection import train_test_split",No,5,22.0 "initial_sub = '../input/max-flow-with-min-cost-v2-0-9267/subm_0.926447635166.csv' subm = pd.read_csv(initial_sub) subm['gift_rank'] = subm.groupby('GiftId').rank() - 1 subm['gift_id'] = subm['GiftId'] * 1000 + subm['gift_rank'] subm['gift_id'] = subm['gift_id'].astype(np.int64) current_gift_ids = subm['gift_id'].values",Yes,3,8.0 "import os, cv2, itertools import numpy as np import pandas as pd import matplotlib.pyplot as plt %matplotlib inline",No,5,23.0 "train_images_dogs_cats.sort(key=natural_keys) train_images_dogs_cats = train_images_dogs_cats[0:1300] + train_images_dogs_cats[12500:13800] test_images_dogs_cats.sort(key=natural_keys)",Yes,4,9.0 "def prepare_data(list_of_images):
""""""
Returns two arrays:
x is an array of resized images
y is an array of labels
""""""
x = [] # images as arrays
y = [] # labels

for image in list_of_images:
x.append(cv2.resize(cv2.imread(image), (img_width,img_height), interpolation=cv2.INTER_CUBIC))

for i in list_of_images:
if \'dog\' in i:
y.append(1)
elif \'cat\' in i:
y.append(0)
#else:
#print(\'neither cat nor dog name present in images\')

return x, y",No,5,21.0 "from sklearn.linear_model import LogisticRegressionCV from sklearn.metrics import confusion_matrix clf = LogisticRegressionCV() X_train_lr, y_train_lr = X_train.T, y_train.T.ravel() clf.fit(X_train_lr, y_train_lr)",Yes,3,22.0 "X, Y = prepare_data(train_images_dogs_cats) print(K.image_data_format())",Yes,4,21.0 "# First split the data in two sets, 80% for training, 20% for Val/Test) X_train, X_val, Y_train, Y_val = train_test_split(X,Y, test_size=0.2, random_state=1)",No,5,13.0 "submission = pd.DataFrame(np.hstack([test_idx, clf.predict_proba(X_test_lr)]), columns=['id', 'cat', 'dog'])",No,5,12.0 "nb_train_samples = len(X_train) nb_validation_samples = len(X_val) batch_size = 16",No,5,77.0 "model = models.Sequential() model.add(layers.Conv2D(32, (3, 3), input_shape=(img_width, img_height, 3))) model.add(layers.Activation('relu')) model.add(layers.MaxPooling2D(pool_size=(2, 2))) model.add(layers.Conv2D(32, (3, 3))) model.add(layers.Activation('relu')) model.add(layers.MaxPooling2D(pool_size=(2, 2))) model.add(layers.Conv2D(64, (3, 3))) model.add(layers.Activation('relu')) model.add(layers.MaxPooling2D(pool_size=(2, 2))) model.add(layers.Flatten()) model.add(layers.Dense(64)) model.add(layers.Activation('relu')) model.add(layers.Dropout(0.5)) model.add(layers.Dense(1)) model.add(layers.Activation('sigmoid')) model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy']) model.summary()",No,5,4.0 "submission = submission.drop([\'cat\'], axis=1)
submission = submission.rename(index=str, columns={""dog"": ""label""})
submission[\'id\'] = submission[\'id\'].astype(int)
submission.sort_values(\'id\', inplace=True)",Yes,3,61.0 submission.head(),No,5,41.0 "submission.to_csv('STahamtan_Dog_vs_Cat_Submission.csv', index=False)",No,5,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here\'s several helpful packages to load in

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Input data files are available in the ""../input/"" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir(""../input""))

# Any results you write to the current directory are saved as output.",No,5,88.0 "train = pd.read_json(""../input/train.json"") test = pd.read_json(""../input/test.json"")",No,5,44.0 test.info(),No,5,40.0 "history = model.fit_generator( train_generator, steps_per_epoch=nb_train_samples // batch_size, epochs=30, validation_data=validation_generator, validation_steps=nb_validation_samples // batch_size ) ",No,4,7.0 "model.save_weights('model_wieghts.h5') model.save('model_keras.h5')",Yes,4,50.0 train['interest_level'].value_counts(),No,5,72.0 "facet = sns.FacetGrid(train, hue = ""interest_level"", aspect=4)
facet.map(sns.kdeplot, \'bathrooms\', shade=True)
facet.add_legend()
plt.show()",No,5,33.0 "facet = sns.FacetGrid(train, hue = ""interest_level"", aspect=4)
facet.map(sns.kdeplot, \'bathrooms\', shade=True)
facet.set(xlim=(0,2))
facet.add_legend()
plt.show()",No,5,33.0 "facet = sns.FacetGrid(train, hue = ""interest_level"", aspect=4)
facet.map(sns.kdeplot, \'bathrooms\', shade=True)
facet.set(xlim=(2,6))
facet.add_legend()
plt.show()",No,5,33.0 "for dataset in train_test: dataset.loc[ dataset['bathrooms'] <= 2, 'bathrooms'] = 2, dataset.loc[(dataset['bathrooms'] > 2) & (dataset['bathrooms'] <= 4), 'bathrooms'] = 1, dataset.loc[ dataset['bathrooms'] > 4, 'bathrooms'] = 0 ",No,5,8.0 "facet = sns.FacetGrid(train, hue = ""interest_level"", aspect=4)
facet.map(sns.kdeplot, \'bedrooms\', shade=True)
facet.add_legend()
plt.show()",No,5,33.0 "for dataset in train_test: dataset.loc[ dataset['bedrooms'] <= 2, 'bedrooms'] = 0, dataset.loc[(dataset['bedrooms'] > 2) & (dataset['bedrooms'] <= 4), 'bedrooms'] = 1, dataset.loc[ dataset['bedrooms'] > 4, 'bedrooms'] = 2",No,5,8.0 "counter = range(1, len(test_images_dogs_cats) + 1)
solution = pd.DataFrame({""id"": counter, ""label"":list(prediction_probabilities)})
cols = [\'label\']

for col in cols:
solution[col] = solution[col].map(lambda x: str(x).lstrip(\'[\').rstrip(\']\')).astype(float)

solution.to_csv(""dogsVScats.csv"", index = False)",Yes,3,12.0 "import pandas as pd import numpy as np from sklearn import ensemble, preprocessing",No,5,22.0 "# Load dataset train = pd.read_csv('../input/train.csv') test = pd.read_csv('../input/test.csv') sample = pd.read_csv('../input/sampleSubmission.csv') weather = pd.read_csv('../input/weather.csv')",No,5,45.0 sum(train['building_id']=='0'),No,5,72.0 "# Not using codesum for this benchmark weather = weather.drop('CodeSum', axis=1)",No,5,10.0 "train[""created""] = pd.to_datetime(train[""created""]) train[""month_created""] = train[""created""].dt.month",Yes,3,16.0 "train[""month_created""]",No,5,41.0 train['month_created'].value_counts(),No,5,72.0 "# Split station 1 and 2 and join horizontally weather_stn1 = weather[weather['Station']==1] weather_stn2 = weather[weather['Station']==2] weather_stn1 = weather_stn1.drop('Station', axis=1) weather_stn2 = weather_stn2.drop('Station', axis=1) weather = weather_stn1.merge(weather_stn2, on='Date')",Yes,3,14.0 "def bar_chart(feature):
low = train[train['interest_level']=='low'][feature].value_counts()
medium = train[train['interest_level']=='medium'][feature].value_counts()
high = train[train['interest_level']=='high'][feature].value_counts()
df = pd.DataFrame([low, medium, high])
df.index = ['low','medium','high']
df.plot(kind='bar',stacked=True, figsize=(10,5))",Yes,3,33.0 "train[""created""] = pd.to_datetime(train[""created""])
train[""date_created""] = train[""created""].dt.date
cnt_srs = train[\'date_created\'].value_counts()

plt.figure(figsize=(12,4))
ax = plt.subplot(111)
ax.bar(cnt_srs.index, cnt_srs.values, alpha=0.8)
ax.xaxis_date()
plt.xticks(rotation=\'vertical\')
plt.show()",Yes,4,33.0 "train['month'] = train.Date.apply(create_month) train['day'] = train.Date.apply(create_day) test['month'] = test.Date.apply(create_month) test['day'] = test.Date.apply(create_day)",No,5,8.0 "# Add integer latitude/longitude columns train['Lat_int'] = train.Latitude.apply(int) train['Long_int'] = train.Longitude.apply(int) test['Lat_int'] = test.Latitude.apply(int) test['Long_int'] = test.Longitude.apply(int)",No,5,16.0 train['day_of_week'] = train['created'].dt.weekday,No,5,8.0 "# drop address columns train = train.drop(['Address', 'AddressNumberAndStreet','WnvPresent', 'NumMosquitos'], axis = 1) test = test.drop(['Id', 'Address', 'AddressNumberAndStreet'], axis = 1)",No,5,10.0 "test[""created""] = pd.to_datetime(test[""created""])
test[\'day_of_week\'] = test[\'created\'].dt.weekday",Yes,3,8.0 train['created_day'] = train['created'].dt.day,No,5,8.0 test['created_day'] = test['created'].dt.day,No,5,8.0 "### Iterest per Day of Week
fig = plt.figure(figsize=(12,6))
sns.countplot(x=""created_day"", hue=""interest_level"", hue_order=[\'low\', \'medium\', \'high\'], data=train);
plt.xlabel(\'created_day\');
plt.ylabel(\'Number of occurrences\');",No,5,75.0 "# Merge with weather data train = train.merge(weather, on='Date') test = test.merge(weather, on='Date') train = train.drop(['Date'], axis = 1) test = test.drop(['Date'], axis = 1)",Yes,3,32.0 "# Convert categorical data to numbers lbl = preprocessing.LabelEncoder() lbl.fit(list(train['Species'].values) + list(test['Species'].values)) train['Species'] = lbl.transform(train['Species'].values) test['Species'] = lbl.transform(test['Species'].values)",No,5,20.0 "train[""num_features""] = train[""features""].apply(len) test[""num_features""] = test[""features""].apply(len)",No,5,8.0 "llimit = np.percentile(train.latitude.values, 1) ulimit = np.percentile(train.latitude.values, 99) train['latitude'].ix[train['latitude']ulimit] = ulimit plt.figure(figsize=(8,6)) sns.distplot(train.latitude.values, bins=50, kde=False) plt.xlabel('latitude', fontsize=12) plt.show()",Yes,3,33.0 "llimit = np.percentile(train.longitude.values, 1) ulimit = np.percentile(train.longitude.values, 99) train['longitude'].ix[train['longitude']ulimit] = ulimit plt.figure(figsize=(8,6)) sns.distplot(train.longitude.values, bins=50, kde=False) plt.xlabel('longitude', fontsize=12) plt.show()",Yes,3,33.0 "lbl.fit(list(train['Street'].values) + list(test['Street'].values)) train['Street'] = lbl.transform(train['Street'].values) test['Street'] = lbl.transform(test['Street'].values)",No,5,20.0 train['price'],No,5,41.0 "lbl.fit(list(train['Trap'].values) + list(test['Trap'].values)) train['Trap'] = lbl.transform(train['Trap'].values) test['Trap'] = lbl.transform(test['Trap'].values)",No,5,20.0 "facet = sns.FacetGrid(train, hue = ""interest_level"", aspect=4)
facet.map(sns.kdeplot, \'price\', shade=True)
facet.add_legend()
plt.show()",No,5,33.0 "facet = sns.FacetGrid(train, hue = ""interest_level"", aspect=4)
facet.map(sns.kdeplot, \'price\', shade=True)
facet.set(xlim=(0,100000))
facet.add_legend()
plt.show()",No,5,33.0 "facet = sns.FacetGrid(train, hue = ""interest_level"", aspect=4)
facet.map(sns.kdeplot, \'price\', shade=True)
facet.set(xlim=(100000,200000))
facet.add_legend()
plt.show()",No,5,33.0 "# drop columns with -1s train = train.ix[:,(train != -1).any(axis=0)] test = test.ix[:,(test != -1).any(axis=0)]",No,4,10.0 "facet = sns.FacetGrid(train, hue = ""interest_level"", aspect=4)
facet.map(sns.kdeplot, \'price\', shade=True)
facet.set(xlim=(0,10000))
facet.add_legend()
plt.show()",No,5,33.0 "# Random Forest Classifier clf = ensemble.RandomForestClassifier(n_jobs=1, n_estimators=1000, min_samples_split= 2) clf.fit(train, labels)",No,5,7.0 "features_drop = ['building_id', 'created', 'description', 'display_address', 'features', 'manager_id', 'photos', 'street_address', 'month_created', 'date_created'] train1 = train.drop(features_drop, axis=1)",No,5,10.0 "features_drop = ['building_id', 'created', 'description', 'display_address', 'features', 'manager_id', 'photos', 'street_address'] test1 = test.drop(features_drop, axis=1)",No,5,10.0 "X = train[['bathrooms','bedrooms','latitude','longitude','price','day_of_week','created_day','num_features']] y = train1['interest_level']",No,5,21.0 "from sklearn.ensemble import RandomForestClassifier from sklearn.model_selection import train_test_split from sklearn.metrics import log_loss X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.33)",Yes,4,22.0 "X = test[['bathrooms','bedrooms','latitude','longitude','price','day_of_week','created_day','num_features']] y = clf.predict_proba(X)",Yes,3,21.0 "t = 732 m=8 n=8",No,5,77.0 "df = pd.read_csv('../input/data.txt',skiprows=2,sep=' ',names=list(map(str,(list(range(n))))))",No,5,45.0 "sub = pd.DataFrame() sub[""listing_id""] = test[""listing_id""] for label in [""high"", ""medium"", ""low""]: sub[label] = y[:, labels2idx[label]] sub.to_csv(""submission_rf.csv"", index=False)",Yes,3,45.0 "from sklearn.feature_extraction.text import TfidfVectorizer from nltk.corpus import stopwords from nltk import word_tokenize import pandas as pd import numpy as np import re from sklearn.model_selection import train_test_split from collections import defaultdict from sklearn.linear_model import LogisticRegression from sklearn.preprocessing import LabelEncoder from sklearn.metrics import accuracy_score, classification_report import json",No,5,22.0 "with open('../input/train.json') as f: raw_train = json.load(f) with open('../input/test.json') as f: raw_test = json.load(f)",No,5,44.0 softmax = LogisticRegression(**m_params),No,5,4.0 "test_x = [j for i in sorted(val.keys()) for j in val[i]] true = [i for i in sorted(val.keys()) for j in val[i]]",No,5,53.0 pred = softmax.predict(tfidf.transform(test_x)),No,5,48.0 "accuracy_score(true, pred)",No,5,49.0 "lab = LabelEncoder() c_true = lab.fit_transform(true) c_pred = lab.transform(pred) print(classification_report(c_true, c_pred, target_names=lab.classes_, digits=5))",No,4,20.0 "sub_df.to_csv('softmax_reg.csv', index=False)",No,5,25.0 "import pandas as pd import numpy as np import seaborn as sns import matplotlib.pyplot as plt import warnings % matplotlib inline from datetime import datetime from scipy import stats pd.options.mode.chained_assignment = None from scipy.stats import norm, skew warnings.filterwarnings(""ignore"", category=DeprecationWarning) from sklearn.ensemble import RandomForestRegressor from sklearn.linear_model import LinearRegression,Ridge,Lasso from sklearn.model_selection import GridSearchCV from sklearn import metrics import warnings",No,5,23.0 "train = pd.read_csv('../input/train.csv') test = pd.read_csv('../input/test.csv')",No,5,45.0 test.head(),No,5,41.0 train.describe(),No,5,40.0 "print('Train Dataset Shape : {0}'.format(train.shape)) print('Test Dataset Shape : {0}'.format(test.shape))",No,5,58.0 train.dtypes,No,5,70.0 sns.boxplot(train['count']),No,5,33.0 new_df = pd.DataFrame(new_df),No,5,12.0 "train = train[np.abs(train[""count""]-train[""count""].mean())<=(3*train[""count""].std())] ",Yes,4,14.0 
"fig,ax = plt.subplots(2,1,figsize = (10,10))
sns.distplot(train[\'count\'],ax=ax[0])
stats.probplot(train[""count""], dist=\'norm\', fit=True, plot=ax[1])
print(\'Skewness : {0}\'.format(train[\'count\'].skew()))
print(\'Kurt : {0}\'.format(train[\'count\'].kurt()))",No,5,33.0 new_df['m'] = new_df['t'].apply(lambda x : int(x)%8),No,5,8.0 "fig,ax = plt.subplots(2,1,figsize = (10,10))
#logcount = np.log1p(train[\'count\']).kurt()
#rootcount = np.sqrt(train[\'count\']).kurt()
#cubiccount = np.power(train[\'count\'],2).kurt()
#minVal = min([logcount, rootcount, cubiccount])
#if logcount == minVal:
best = \'log\'
train[\'count_log\'] = np.log1p(train[\'count\'])
sns.distplot(train[\'count_log\'],ax=ax[0])
stats.probplot(train[""count_log""], dist=\'norm\', fit=True, plot=ax[1])
#elif rootcount == minVal:
#best = \'root\'
#train[\'count_root\'] = np.sqrt(train[\'count\'])
#sns.distplot(train[\'count_root\'],ax=ax[0])
#stats.probplot(train[""count_root""], dist=\'norm\', fit=True, plot=ax[1])
#elif cubiccount == minVal:
#best = \'cubic\'
#train[\'count_cubic\'] = np.power(train[\'count\'],2)
#sns.distplot(train[\'count_cubic\'],ax=ax[0])
#stats.probplot(train[""count_cubic""], dist=\'norm\', fit=True, plot=ax[1])
#print(\'For count, the Best TF is \' + best)",No,5,33.0 new_df['t'] = new_df['t'].apply(lambda x:x//8),No,5,8.0 new_df['hour'] = new_df['t'].apply(lambda x : x%24 ),No,5,8.0 "train['date'] = train.datetime.apply(lambda x: x.split()[0]) train['hour'] = train.datetime.apply(lambda x: x.split()[1].split(':')[0]) train['weekday'] = train.date.apply(lambda dateString : datetime.strptime(dateString, '%Y-%m-%d').weekday()) train['month'] = train.date.apply(lambda dateString : datetime.strptime(dateString, '%Y-%m-%d').month) train = train.drop('datetime',axis=1)",No,4,8.0 new_df['day'] = new_df['t'].apply(lambda x : x//24 ),No,5,8.0 train.shape,No,5,58.0 "categorical = [\'date\',\'weekday\',\'month\',\'hour\',\'season\',\'holiday\',\'workingday\',\'weather\']
numeric = [""temp"",""atemp"",""casual"",""registered"",""humidity"",""windspeed"",""count"",""count_log""]",No,5,77.0 "new_df['mm']=new_df['m'] new_df['nn']= new_df['n']",No,5,8.0 new_df['id'] = new_df['t'].map(str)+':'+new_df['mm'].map(str)+':'+new_df['nn'].map(str),No,5,8.0 "for idx in categorical: train[idx].astype('category')",Yes,4,16.0 "fig,axes = plt.subplots(ncols=2 ,nrows=2) fig.set_size_inches(15,10) sns.boxplot(data=train,x='season',y='count',ax=axes[0][0]) sns.boxplot(data=train,x='holiday',y='count',ax=axes[0][1]) sns.boxplot(data=train,x='workingday',y='count',ax=axes[1][0]) sns.boxplot(data=train,x='weather',y='count',ax=axes[1][1]) fig1,axes1 = plt.subplots() fig1.set_size_inches(15,10) sns.boxplot(data=train,x='hour',y='count')",No,5,33.0 "plt.subplots(figsize=(15,8)) sns.heatmap(train[numeric].corr(),annot=True)",No,5,80.0 "corr = train[numeric].drop(\'count\', axis=1).corr()
corr = corr.drop(\'count_log\', axis=1).drop(\'count_log\', axis=0) # drop the log target as well; we only want feature-to-feature correlations
plt.figure(figsize=(12, 10))
sns.heatmap(corr[(corr >= 0.5) | (corr <= -0.4)],
cmap=\'viridis\', vmax=1.0, vmin=-1.0, linewidths=0.1,
annot=True, annot_kws={""size"": 8}, square=True);",No,5,80.0 new_df = df_r.set_index('id').join(new_df.set_index('id')),No,4,32.0 "### count,month plt.figure(figsize=(15,8)) monthagg = pd.DataFrame(train.groupby('month')['count'].mean()).reset_index() sns.barplot(data=monthagg, x='month',y='count').set(title = 'Month Vs Count')",No,4,33.0 "from sklearn.preprocessing import StandardScaler scaler = StandardScaler() scaler.fit(new_df.drop(['val','t','mm','nn'],axis=1))",No,5,18.0 "### count,season,hour plt.figure(figsize=(15,8)) houragg = pd.DataFrame(train.groupby(['hour','season'])['count'].mean()).reset_index() sns.pointplot(data=houragg,x=houragg['hour'],y=houragg['count'],hue=houragg['season']).set(title='Hour,Season Vs Count')",Yes,4,33.0 "scaler.transform(new_df.drop(['val','t','mm','nn'],axis=1))",No,5,18.0 "### count,hour,weekday plt.figure(figsize=(15,8)) hourweekagg = pd.DataFrame(train.groupby(['hour','weekday'])['count'].mean()).reset_index() sns.pointplot(data=hourweekagg,x=hourweekagg['hour'],y=hourweekagg['count'],hue=hourweekagg['weekday']).set(title='Hour,Week Vs Count')",Yes,4,33.0 "from sklearn.linear_model import ElasticNet, Lasso, BayesianRidge, LassoLarsIC from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor from sklearn.kernel_ridge import KernelRidge from sklearn.pipeline import make_pipeline from sklearn.preprocessing import RobustScaler from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone from sklearn.model_selection import KFold, cross_val_score, train_test_split from sklearn.metrics import mean_squared_error import xgboost as xgb import lightgbm as lgb",No,5,22.0 "new_df[['m1','m2','m3','m4','n','m','hour','day']]=scaler.transform(new_df.drop(['val','t','mm','nn'],axis=1))",No,5,18.0 "target = train['count'] target_log=train['count_log'] train = train.drop('count_log',axis=1) train = train.drop('count',axis=1) train = train.drop('atemp',axis=1) train = train.drop('date',axis=1) train = train.drop('casual',axis=1) train = train.drop('registered',axis=1) m_dum = pd.get_dummies(train['month'],prefix='m') ho_dum = pd.get_dummies(train['hour'],prefix='ho') s_dum = pd.get_dummies(train['season'],prefix='s') we_dum = pd.get_dummies(train['weather'],prefix='we') train = pd.concat([train,s_dum,we_dum,m_dum,ho_dum],axis=1) testid = test['datetime'] test['date'] = test.datetime.apply(lambda x: x.split()[0]) test['hour'] = test.datetime.apply(lambda x: x.split()[1].split(':')[0]) test['weekday'] = test.date.apply(lambda dateString : datetime.strptime(dateString, '%Y-%m-%d').weekday()) test['month'] = test.date.apply(lambda dateString : datetime.strptime(dateString, '%Y-%m-%d').month) test = test.drop('datetime',axis=1) test = test.drop('atemp',axis=1) test = test.drop('date',axis=1) s_dum = pd.get_dummies(test['season'],prefix='s') we_dum = pd.get_dummies(test['weather'],prefix='we') m_dum = pd.get_dummies(test['month'],prefix='m') ho_dum = pd.get_dummies(test['hour'],prefix='ho') test= pd.concat([test,s_dum,we_dum,m_dum,ho_dum],axis=1)",Yes,3,10.0 train = new_df[new_df['val'] !=-1],No,5,14.0 "X_train = train.drop(['val','t','mm','nn'], axis=1) y_train = train['val'].values",No,5,21.0 test = new_df[new_df['val'] == -1],No,5,14.0 "X_test = test.drop(['val','t','mm','nn'], axis=1)",No,5,10.0 "gbr = GradientBoostingRegressor(n_estimators=2000, learning_rate=0.01, max_depth=4).fit(train.values, target_log)",No,5,7.0 "import keras from keras.datasets import mnist from keras.models import Sequential from keras.layers import Dense 
from keras.optimizers import adam",No,5,22.0 "model = Sequential() model.add(Dense(128, activation='relu', input_shape=(8,))) model.add(Dense(256, activation='relu')) model.add(Dense(256, activation='relu')) model.add(Dense(1)) model.summary()",No,5,84.0 "model_gbr = GradientBoostingRegressor(n_estimators=1500,max_depth=5,learning_rate=0.01).fit(train.values,target_log)",No,5,7.0 "prediction = model_gbr.predict(test.values) prediction = np.expm1(prediction)",Yes,4,48.0 "output = pd.DataFrame() output['datetime'] = testid output['count'] = prediction output.to_csv('output.csv',index=False)",Yes,3,25.0 "import glob import os, sys import random from tqdm import tqdm import numpy as np from keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array from keras.models import Sequential from keras.layers import Dropout, Flatten, Dense from keras import applications import seaborn as sns import pandas as pd import matplotlib.pyplot as plt from keras import backend as K K.tensorflow_backend._get_available_gpus()",No,5,23.0 "train_data_dir = '../input/dogs-vs-cats-redux-kernels-edition/train' test_data_dir = '../input/dogs-vs-cats-redux-kernels-edition/test' # Make sure you include https://www.kaggle.com/keras/vgg16/data as your data source vgg_model_path = '../input/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5' epochs = 20 batch_size = 20 img_width, img_height = 150, 150 training_n_bound = 5000 # set to None to use the entire training dataset; it took about 2 hours at my Macbook Pro.",No,5,77.0 "# Wrap training data into pandas' DataFrame. lst = list(gen_image_label(train_data_dir)) random.shuffle(lst) if training_n_bound is not None: lst = lst[:training_n_bound] train_df = pd.DataFrame(lst, columns=['label', 'id', 'filename']) train_df = train_df.sort_values(by=['label', 'id']) train_df.head(3)",Yes,3,12.0 "train_df['label_code'] = train_df.label.map({'cat':0, 'dog':1}) train_df.head(3)",Yes,3,41.0 "# Wrap testing data into pandas' DataFrame. 
lst = list(gen_image_label(test_data_dir)) test_df = pd.DataFrame(lst, columns=['label', 'id', 'filename']) test_df = test_df.sort_values(by=['label', 'id']) test_df['label_code'] = test_df.label.map({'cat':0, 'dog':1}) test_df.head(3)",Yes,3,12.0 "sns.countplot(train_df.label) plt.title('Number of training images per category')",No,5,33.0 "model.fit(train_embeddings[train_indices,:], train_labels, epochs=epochs, batch_size=batch_size, validation_data=(train_embeddings[validate_indices,:], validation_labels)) model.save_weights(embedding_fc_model)",Yes,3,7.0 "from sklearn.metrics import f1_score, accuracy_score pred_validation = model.predict(train_embeddings[validate_indices,:]) f1 = f1_score(validation_labels, pred_validation > 0.5) acc = accuracy_score(validation_labels, pred_validation > 0.5) (f1, acc)",Yes,4,49.0 "pred_test = model.predict(test_embeddings) pred_test.shape",Yes,3,48.0 "results = pd.DataFrame({'id': pd.Series(test_df.id.values[:pred_test.shape[0]]), 'label': pd.Series(pred_test.T[0])}) results.to_csv('submission.csv', index=False) results.head(10)",Yes,4,12.0 "df = pd.DataFrame(np.concatenate((X, train_labels[:train_embeddings[train_indices,:].shape[0]].reshape(train_embeddings[train_indices,:].shape[0],1)), axis=1), columns=['X', 'Y', 'Z', 'label'])",No,5,12.0 "g = sns.FacetGrid(df, hue=""label"", size=7) g.map(plt.scatter, ""X"", ""Y"", alpha=.5) g.add_legend(); g = sns.FacetGrid(df, hue=""label"", size=7) g.map(plt.scatter, ""Y"", ""Z"", alpha=.5) g.add_legend(); g = sns.FacetGrid(df, hue=""label"", size=7) g.map(plt.scatter, ""X"", ""Z"", alpha=.5) g.add_legend();",No,5,33.0 "from nltk import word_tokenize from nltk.corpus import stopwords import pandas as pd import numpy as np import re from sklearn.metrics import accuracy_score, classification_report from sklearn.preprocessing import LabelEncoder from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.pipeline import Pipeline from scipy import sparse from sklearn.svm import LinearSVC import json from sklearn.model_selection import train_test_split import copy",No,5,22.0 "train, val = train_test_split(raw_train, test_size=0.2, random_state=2018)",No,5,13.0 "pred = [] test_pred = [] for i in range(3): p.get_params()['lr'].class_weight = {0: 1, 1:w[i] } p.fit(train_x, train_y[i]) pred.append(p.decision_function(val_x)) test_pred.append(p.decision_function(test_x))",Yes,3,7.0 "accuracy_score(val_y, np.argmax(np.array(pred), axis=0))",No,5,49.0 "sub_df = pd.DataFrame() sub_df['id'] = [i['id'] for i in raw_test] sub_df['sentiment'] = np.argmax(np.array(test_pred), axis=0) sub_df['sentiment']= sub_df['sentiment'].apply(lambda x: lab.classes_[x])",Yes,4,8.0 "sub_df.to_csv('nb.csv', index=False)",No,5,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here\'s several helpful packages to load in

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the ""../input/"" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from os import listdir
from os.path import join, basename
from PIL import Image
print(listdir(""../input""))
print(listdir("".""))
IMG_HEIGHT = 50
IMG_WIDTH = 50
NUM_CHANNELS = 3

from threading import current_thread, Thread, Lock
from multiprocessing import Queue
# Any results you write to the current directory are saved as output.",Yes,4,22.0 "# initializations related to threading stuff batch_size = 500 num_train_images = 25000 num_test_images = 12500 num_train_threads = int(num_train_images/batch_size) # 50 num_test_threads = int(num_test_images/batch_size) # 25 lock = Lock()",No,4,77.0 "print(train_x.shape) print(len(train_y))",No,5,58.0 "test_x =get_testing_data() print(test_x.shape)",Yes,4,58.0 "print(""train_x shape"",train_x.shape) print(""test_x shape"", test_x.shape) # convert train_y to np. array train_y = np.array(train_y) print(""train_y.shape"", train_y.shape)",No,5,58.0 "# mean normalize train and test images train_x = train_x/255 test_x = test_x/255",No,5,8.0 "# import required packages from keras.models import Sequential from keras.layers import Dense, Dropout, Activation, Flatten, BatchNormalization from keras.layers import Conv2D, MaxPooling2D from keras.utils import np_utils, to_categorical from sklearn.model_selection import train_test_split",No,5,22.0 "from scipy import sparse
from sklearn.feature_extraction.text import CountVectorizer
tfidf = CountVectorizer(stop_words=\'english\', max_features=200)
tfidf.fit(list(train_df[\'features\']) + list(test_df[\'features\']))
tr_sparse = tfidf.transform(train_df[""features""])
te_sparse = tfidf.transform(test_df[""features""])
print(te_sparse)
",No,4,8.0 "print(train_df[features_to_use].head()) ",No,5,41.0 "train_X = sparse.hstack([train_df[features_to_use], tr_sparse]).tocsr() ",No,4,32.0 "test_X = sparse.hstack([test_df[features_to_use], te_sparse]).tocsr() ",No,4,32.0 "target_num_map = {'high':0, 'medium':1, 'low':2} train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x])) print(train_X.shape, test_X.shape) ",No,4,20.0 "preds, model = runXGB(train_X, train_y, test_X, num_rounds=818) out_df = pd.DataFrame(preds) out_df.columns = [""high"", ""medium"", ""low""] out_df[""listing_id""] = test_df.listing_id.values out_df.to_csv(""xgb_starter2.csv"", index=False) ",No,4,55.0 "%matplotlib inline import matplotlib.pyplot as plt from tqdm import tqdm_notebook import numpy as np import pandas as pd import torch import torchvision import torchvision.transforms as transforms import torch.nn as nn import torch.nn.functional as F",No,5,23.0 torch.__version__,No,4,23.0 from torch.autograd import Variable,No,5,22.0 "use_gpu = torch.cuda.is_available() use_gpu",No,4,77.0 !ls ../input/cifar10-python/,No,5,88.0 !tar -zxvf ../input/cifar10-python/cifar-10-python.tar.gz,No,5,73.0 "model.fit(X_train, y_train, batch_size=128, epochs=240, verbose=1,validation_split=0.2)",No,5,7.0 "from xgboost import XGBRegressor model_XGB = XGBRegressor() model_XGB.fit(X_train,y_train)",Yes,4,4.0 predict = model.predict(X_test),No,5,48.0 X_test['demand'] = X_test['pred'].apply(make_positive),No,5,8.0 "X_test[['id','demand']].to_csv('result.csv',index=False)",No,5,25.0 "import numpy as np import pandas as pd import random import matplotlib.pyplot as plt from sklearn.ensemble import RandomForestClassifier from sklearn.ensemble import GradientBoostingClassifier from sklearn.ensemble import BaggingClassifier from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB from sklearn.neighbors import NearestCentroid, RadiusNeighborsClassifier, KNeighborsClassifier from sklearn.neural_network import MLPClassifier from sklearn.gaussian_process import GaussianProcessClassifier from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis from scipy import stats from sklearn.svm import SVC, LinearSVC, NuSVC from sklearn.linear_model import LogisticRegression from sklearn.preprocessing import normalize from sklearn.utils import shuffle from sklearn import decomposition, cross_decomposition # Input data files are available in the ""../input/"" directory. 
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory import os print(os.listdir(""../input"")) # number of attributes for pca decompose_to = 5",No,5,88.0 " # show data on 2d plots positives = train_data_pca[train_labels[:]==1] negatives = train_data_pca[train_labels[:]==0] positivest = test_data_pca[test_predicted[:]==1] negativest = test_data_pca[test_predicted[:]==0] for j in range(decompose_to): plt.figure(figsize=(20,15)) for i in range(decompose_to): plt.subplot(4,5,i+1) axis = [j,i] a=positives[:,axis] plt.scatter(*zip(*a), color='r') a=negatives[:,axis] plt.scatter(*zip(*a), color='b') plt.title(str(axis)) plt.show() plt.figure(figsize=(20,15)) for i in range(decompose_to): plt.subplot(4,5,i+1) axis = [j,i] a=positivest[:,axis] plt.scatter(*zip(*a), color='g') a=negativest[:,axis] plt.scatter(*zip(*a), color='c') plt.title(str(axis)) plt.show()",No,5,33.0 "# PREDICTING THE EFFECTS OF GENETIC VARIATIONS USING LGBM # BY - OMKAR SABNIS - 29-05-2018 #Importing library import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns import re import nltk from nltk.corpus import stopwords from nltk.stem.porter import PorterStemmer from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer from sklearn.preprocessing import StandardScaler,OneHotEncoder from sklearn.preprocessing import LabelEncoder from sklearn.metrics import confusion_matrix,mean_squared_error from sklearn.model_selection import KFold, cross_val_score,train_test_split from sklearn.naive_bayes import GaussianNB,MultinomialNB from sklearn.ensemble import RandomForestClassifier import lightgbm as lgb",No,5,22.0 "# READING THE DATASETS
train = pd.read_csv(""../input/training_variants"")
trainx = pd.read_csv(\'../input/training_text\',sep = \'\\|\\|\', engine= \'python\', header=None,
skiprows=1, names=[""ID"",""Text""])
train = pd.merge(train, trainx, how = \'left\', on = \'ID\').fillna(\'\')
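# training_text stores each row as ID||Text, which is why the read_csv above uses a regex
# separator with the python engine; merging on ID then attaches the clinical text to its
# Gene/Variation/Class row, with missing texts falling back to an empty string.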
train.head()",Yes,3,45.0 "test = pd.read_csv(""../input/stage2_test_variants.csv"")
testx = pd.read_csv(\'../input/stage2_test_text.csv\',sep = \'\\|\\|\', engine= \'python\', header=None,
skiprows=1, names=[""ID"",""Text""])
test = pd.merge(test, testx, how = \'left\', on = \'ID\').fillna(\'\')
test.head()",Yes,3,45.0 "def textlen(train): k = train['Text'].apply(lambda x: len(str(x).split())) l = train['Text'].apply(lambda x: len(str(x))) return k, l train['Text_no_word'], train['Text_no_char'] = textlen(train) test['Text_no_word'], test['Text_no_char'] = textlen(test)",No,5,8.0 "tfidf = TfidfVectorizer(
    min_df=1, max_features=1600, strip_accents='unicode', lowercase=True,
    analyzer='word', token_pattern=r'\\w+', ngram_range=(1, 3), use_idf=True,
    smooth_idf=True, sublinear_tf=True, stop_words='english')
X_train = tfidf.fit_transform(train['Text']).toarray()
print(X_train)
X_test = tfidf.transform(test['Text']).toarray()
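# The test text is only transformed, not re-fit, so X_test shares the vocabulary learned from
# the training text and its 1600 columns line up with X_train.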

def encoding(df,col):
le = LabelEncoder()
for i in col:
df[i] = le.fit_transform(df[i])
train.columns
col = ['Gene', 'Variation', 'Class']
encoding(train,col)
encoding(test,['Gene', 'Variation'])

X_train = pd.DataFrame(X_train)
X_train = X_train.join(train[['Gene', 'Variation', 'Text_no_word','Text_no_char']])
X_test = pd.DataFrame(X_test)
X_test = X_test.join(test[['Gene', 'Variation', 'Text_no_word','Text_no_char']])",Yes,3,8.0 "# FEATURE SCALING sc = StandardScaler() X_train = sc.fit_transform(X_train) X_test = sc.transform(X_test) y_train = train['Class']",No,5,21.0 "xtr,xvl,ytr,yvl = train_test_split(X_train,y_train,test_size=0.3,random_state=10)",No,5,13.0 predictions = [0 if i < 0 else i for i in predictions],No,1,53.0 "output = pd.DataFrame({'datetime': test_data.index, 'count': predictions}) output.to_csv('submission.csv', index=False)",Yes,3,55.0 "# SUBMISSION OF FILE IN CSV FORMAT: submit = pd.DataFrame(test.ID) submit = submit.join(pd.DataFrame(pred_test)) submit.columns = ['ID', 'class1','class2','class3','class4','class5','class6','class7','class8','class9'] submit.to_csv('submission.csv', index=False) ",Yes,3,25.0 "import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns from scipy import stats from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer from scipy.sparse import hstack from sklearn.model_selection import train_test_split import time import lightgbm as lgb import math from sklearn.metrics import mean_squared_error,mean_absolute_error,log_loss import sklearn from sklearn.ensemble import RandomForestClassifier import itertools import xgboost as xgb import random import datetime from wordcloud import WordCloud import re import plotly.offline as py py.init_notebook_mode(connected=True) import plotly.graph_objs as go from plotly import tools from plotly.offline import download_plotlyjs, init_notebook_mode, iplot pd.options.mode.chained_assignment = None # default='warn' %matplotlib inline",No,5,23.0 "train = pd.read_json('../input/train.json') test = pd.read_json('../input/test.json')",No,5,44.0 "merge.loc[merge['bathrooms'] > 7 , 'bathrooms'] = 7 merge['rooms'] = merge['bathrooms'] + merge['bedrooms'] merge['rooms_diff'] = merge['bathrooms'] - merge['bedrooms'] merge['half_bathrooms'] = ((merge['rooms'] - np.floor(merge['rooms'])) > 0).astype(int) features_to_use = np.concatenate([features_to_use, ['bathrooms', 'bedrooms', 'rooms', 'rooms_diff', 'half_bathrooms']]) features_to_use = np.unique(features_to_use) features_to_use",No,4,8.0 "merge.loc[merge[\'latitude\'] < 1, \'latitude\'] = merge[\'latitude\'].mode()[0]
merge.loc[merge[\'longitude\']>-1, \'longitude\'] = merge[\'longitude\'].mode()[0]
scaler = sklearn.preprocessing.MinMaxScaler(feature_range=(0,1))

merge[\'latitude\'] = scaler.fit_transform(np.array(merge[\'latitude\']).reshape(-1,1))
scaler = sklearn.preprocessing.MinMaxScaler(feature_range=(0,1))
merge[\'longitude\'] = scaler.fit_transform(np.array(merge[\'longitude\']).reshape(-1,1))

merge[""pos""] = merge.longitude.round(3).astype(str) + \'_\' + merge.latitude.round(3).astype(str)
pos_vc = merge[\'pos\'].value_counts()
d_pos_vc = pos_vc.to_dict()
merge[\'density\'] = merge[""pos""].apply(lambda x: d_pos_vc.get(x, pos_vc.min()))

features_to_use = np.concatenate([features_to_use, [\'latitude\', \'longitude\', \'density\']])
features_to_use = np.unique(features_to_use)
features_to_use",Yes,2,18.0 "merge[\'num_description_len\'] = merge[\'description\'].str.len()
merge[\'num_description_words\'] = merge[\'description\'].apply(lambda x:len(x.split(\' \')))
merge[\'price_per_bedrooms\'] = merge[\'price\']/merge[\'bedrooms\']
merge[\'price_per_bathrooms\'] = merge[\'price\']/merge[\'bathrooms\']
merge[\'price_per_rooms\'] = merge[\'price\']/merge[\'rooms\']
merge[\'beds_percent\'] = merge[\'bedrooms\']/merge[\'rooms\']
merge[\'num_capital_letters\'] = merge[\'description\'].apply(lambda x: sum(1 for c in x if c.isupper()))
merge[\'num_address_len\'] = merge[\'display_address\'].str.len()
merge[\'num_address_words\'] = merge[\'display_address\'].apply(lambda x:len(x.split(\' \')))
merge[\'address_east\'] = merge[\'street_address\'].apply(lambda x: x.find(\'East\') > -1).astype(int)
merge[\'address_west\'] = merge[\'street_address\'].apply(lambda x: x.find(\'West\') > -1).astype(int)
merge[\'num_photos\'] = merge[\'photos\'].str.len()
merge[\'num_features\'] = merge[\'features\'].str.len()
merge[\'num_photos_low\'] = merge[\'num_photos\'].apply(lambda x:1 if x > 22 else 0) # all is low
merge[\'price_low_medium\'] = merge[\'price\'].apply(lambda x:1 if 7500< x < 10000 else 0) # all is low or medium
merge[\'price_low\'] = merge[\'price\'].apply(lambda x:1 if x >= 10000 else 0) # all is low
def cap_share(x):
return sum(1 for c in x if c.isupper())/float(len(x) + 1)
merge[\'num_cap_share\'] = merge[\'description\'].apply(cap_share)
merge[\'num_description_lines\'] = merge[\'description\'].apply(lambda x: x.count(\'\\n\\n\'))
merge[\'num_redacted\'] = 0
merge.loc[merge[\'description\'].str.contains(\'website_redacted\'), \'num_redacted\'] = 1
merge[\'num_email\'] = 0
merge.loc[merge[\'description\'].str.contains(\'@\'), \'num_email\'] = 1

reg = re.compile("".*?(\\(?\\d{3}\\D{0,3}\\d{3}\\D{0,3}\\d{4}).*?"", re.S)
def try_and_find_nr(description):
if reg.match(description) is None:
return 0
return 1
merge[\'num_phone_nr\'] = merge[\'description\'].apply(try_and_find_nr)
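# try_and_find_nr flags descriptions containing a US-style phone number: an optional opening
# parenthesis, three digits, up to three non-digit separators, three digits, more separators,
# then four digits; re.S lets the pattern match across line breaks in the description.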



features_to_use = np.concatenate([features_to_use, [\'num_description_len\', \'num_description_words\',
\'price_per_bedrooms\', \'price_per_bathrooms\', \'price_per_rooms\', \'num_photos\', \'num_features\',
\'num_photos_low\', \'price_low_medium\', \'price_low\',
\'beds_percent\', \'num_capital_letters\', \'num_address_len\',
\'num_address_words\', \'address_east\', \'address_west\',
\'num_cap_share\', \'num_description_lines\',
\'num_redacted\', \'num_email\', \'num_phone_nr\']])
features_to_use = np.unique(features_to_use)
features_to_use",Yes,4,8.0 "interest_level_dict = {'low' : 0, 'medium' : 1, 'high' : 2 } merge['interest'] = merge['interest_level'].map(interest_level_dict)",No,5,20.0 "created_time = pd.to_datetime(merge['created'],format='%Y-%m-%d %H:%M:%S') merge['month'] = created_time.dt.month merge['day'] = created_time.dt.day merge['hour'] = created_time.dt.hour merge['weekday'] = created_time.dt.weekday merge['week'] = created_time.dt.week merge['quarter'] = created_time.dt.quarter merge['weekend'] = ((merge['weekday'] == 5) | (merge['weekday'] == 6)) merge['days_since'] = created_time.max() - created_time merge['days_since'] = (merge['days_since'] / np.timedelta64(1, 'D')).astype(int) features_to_encode = np.concatenate([features_to_encode, ['month', 'day', 'hour', 'weekday', 'week', 'quarter', 'hour', 'weekend']]) features_to_encode = np.unique(features_to_encode) features_to_encode",Yes,4,8.0 "display_address_min_df = 10 street_address_min_df = 10 features_min_df = 10 description_max_features = 20",No,5,77.0 "cv = CountVectorizer(min_df=display_address_min_df) X_display_address = cv.fit_transform(merge['display_address']) cv = CountVectorizer(min_df=street_address_min_df) X_street_address = cv.fit_transform(merge['street_address']) merge['features_'] = merge['features'].apply(lambda x:' '.join(['_'.join(k.split(' ')) for k in x])) cv = CountVectorizer(stop_words='english', max_features=200) X_features = cv.fit_transform(merge['features_']) tv = TfidfVectorizer(max_features=description_max_features, ngram_range=(1, 5), stop_words='english') X_description = tv.fit_transform(merge['description']) X_vectorized = hstack((X_display_address, X_street_address, X_features, X_description)).tocsr()",Yes,4,8.0 "ohe = sklearn.preprocessing.OneHotEncoder() X_encode = ohe.fit_transform(merge[features_to_encode])",No,5,20.0 "def union_features(features_to_use, X_encode, X_vectorized, target, nrow_train): X_origin = merge[features_to_use] X_origin.fillna(0 ,inplace=True) X = hstack((X_origin, X_encode, X_vectorized)).tocsr() y = merge[target] X_train_all = X[:nrow_train] X_test = X[nrow_train:] y_train_all = y[:nrow_train] # y_test = y[nrow_train:] X_train, X_val, y_train, y_val = train_test_split(X_train_all, y_train_all, test_size=0.2 , random_state=10) return X_train, X_test, X_val, y_train, y_val X_train, X_test, X_val, y_train, y_val = union_features(features_to_use, X_encode, X_vectorized, target, nrow_train)",Yes,4,21.0 merge.info(),No,5,40.0 "Y_pred = model.predict(xgb.DMatrix(X_test)) ids = np.array(test['listing_id'])",No,5,48.0 "preds = pd.DataFrame({""listing_id"": ids, ""high"":Y_pred[:, 0],
""medium"":Y_pred[:, 1], ""low"":Y_pred[:, 2]})
preds.to_csv(\'my_submission.csv\' ,index=False)",Yes,4,25.0 "import numpy as np import pandas as pd import matplotlib.pyplot as plt import seaborn as sns import itertools import graphviz from sklearn.preprocessing import Imputer from sklearn.model_selection import train_test_split, cross_val_score, KFold from sklearn.metrics import confusion_matrix from sklearn import tree from sklearn.tree import DecisionTreeClassifier from sklearn.impute import SimpleImputer from pandas import read_csv %matplotlib inline",Yes,3,22.0 "predict = pd.DataFrame(index=output['datetime']) predict['count'] = output['count'].values predict.head() plot_timeseries_train_and_predict(train_data, predict, 2011, 2)",No,2,41.0 "fig = plt.figure(figsize=(16,9)) plt.plot(train_data.index, train_data['count'], 'b', label = 'train') plt.plot(output['datetime'],output['count'], 'r', label = 'test') plt.title('Train and Test') plt.legend() plt.grid()",No,5,75.0 "import numpy as np import pandas as pd import matplotlib.pyplot as plt import seaborn as sns import itertools import graphviz from sklearn.preprocessing import Imputer from sklearn import preprocessing #hy from sklearn.preprocessing import StandardScaler #hy from sklearn.model_selection import train_test_split, cross_val_score, KFold from sklearn.metrics import confusion_matrix from sklearn import tree from sklearn.tree import DecisionTreeClassifier from sklearn.linear_model import LogisticRegression from sklearn.naive_bayes import GaussianNB from sklearn.neighbors import KNeighborsClassifier from sklearn.svm import SVC from sklearn.ensemble import RandomForestClassifier from sklearn.ensemble import AdaBoostClassifier from sklearn.neural_network import MLPClassifier from sklearn.discriminant_analysis import LinearDiscriminantAnalysis from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis from sklearn.ensemble import GradientBoostingClassifier from sklearn.impute import SimpleImputer from pandas import read_csv %matplotlib inline",Yes,4,22.0 "def plot_decision_tree(a,b):
""""""
http://scikit-learn.org/stable/modules/tree.html
""""""
dot_data = tree.export_graphviz(a, out_file=None, feature_names=b,class_names=[\'Healthy\',\'Diabetes\'],filled=False, rounded=True,special_characters=False)
graph = graphviz.Source(dot_data)
return graph

def plot_confusion_matrix(cm, classes,normalize=False,title=\'Confusion matrix\',cmap=plt.cm.Blues):
""""""
http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
""""""
plt.imshow(cm, interpolation=\'nearest\', cmap=cmap)
plt.title(title)
plt.colorbar()
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=45)
plt.yticks(tick_marks, classes)
fmt = \'.2f\' if normalize else \'d\'
thresh = cm.max() / 2.
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
plt.text(j, i, format(cm[i, j], fmt),horizontalalignment=""center"",color=""white"" if cm[i, j] > thresh else ""black"")
plt.tight_layout()
plt.ylabel(\'True label\')
plt.xlabel(\'Predicted label\')

models = []
models.append((""LR"",LogisticRegression()))
models.append((""NB"",GaussianNB()))
models.append((""KNN"",KNeighborsClassifier()))
models.append((""DT"",DecisionTreeClassifier()))
models.append((""SVM"",SVC()))",Yes,1,80.0 "dataset = read_csv('../input/train.csv') dataset=dataset[['Id','Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age','Outcome']] dataset.head(10)",Yes,2,41.0 "dataset2 = dataset.iloc[:, :-1]
print(""# of Rows, # of Columns: "",dataset2.shape)
print(""\
Column Name # of Null Values\
"")
print((dataset2[:] == 0).sum())",No,4,39.0 "trainingData = read_csv(\'../input/train.csv\')
trainingData=trainingData[[\'Id\',\'Pregnancies\',\'Glucose\',\'BloodPressure\',\'SkinThickness\',\'Insulin\',\'BMI\',\'DiabetesPedigreeFunction\',\'Age\',\'Outcome\']]
testingData = read_csv(\'../input/test.csv\')
testingData=testingData[[\'Id\',\'Pregnancies\',\'Glucose\',\'BloodPressure\',\'SkinThickness\',\'Insulin\',\'BMI\',\'DiabetesPedigreeFunction\',\'Age\']]
trainingFeatures = trainingData.iloc[:, :-1]
trainingLabels = trainingData.iloc[:, -1]
imputer = SimpleImputer(missing_values=0,strategy=\'median\')
trainingFeatures = imputer.fit_transform(trainingFeatures)
trainingFeatures = pd.DataFrame(trainingFeatures)
trainingFeatures.columns=[\'Id\',\'Pregnancies\',\'Glucose\',\'BloodPressure\',\'SkinThickness\',\'Insulin\',\'BMI\',\'DiabetesPedigreeFunction\',\'Age\']
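# Zero readings for Glucose, BloodPressure, SkinThickness, Insulin and BMI stand in for
# missing values in this dataset, hence missing_values=0 with a median fill. Because the
# imputer runs on every column, legitimate zeros in Pregnancies are replaced as well, which
# is a known quirk of this simple setup.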
#further feature engineering
#trainingData[\'Glucose\'] = ((trainingData[\'Glucose\'] >= 160)|(trainingData[\'Glucose\'] <= 100)).astype(int) #hy
#trainingData[\'Pregnancies\'] = (trainingData[\'Pregnancies\'] >= 5).astype(int)
#trainingData[\'Insulin\'] = (trainingData[\'Insulin\'] >= 200).astype(int)
#trainingData[\'DiabetesPedigreeFunction\'] = (trainingData[\'DiabetesPedigreeFunction\'] >= 0.5).astype(int)#hy
#print(trainingData[:])

testingData = imputer.transform(testingData)
testingData = pd.DataFrame(testingData)
#testingData.columns=[\'Id\',\'Pregnancies\',\'Glucose\',\'BloodPressure\',\'SkinThickness\',\'Insulin\',\'BMI\',\'DiabetesPedigreeFunction\',\'Age\']
#testingData[\'Glucose\'] = ((testingData[\'Glucose\'] >= 160)|(testingData[\'Glucose\'] <= 100)).astype(int) #hy
#testingData[\'Pregnancies\'] = (testingData[\'Pregnancies\'] >= 5).astype(int)
#testingData[\'Insulin\'] = (testingData[\'Insulin\'] >= 200).astype(int)
#testingData[\'DiabetesPedigreeFunction\'] = (testingData[\'DiabetesPedigreeFunction\'] >= 0.5).astype(int)

print(""# of Rows, # of Columns: "",trainingFeatures.shape)
print(""\
Column Name # of Null Values\
"")
print((trainingFeatures[:] == 0).sum())",Yes,1,12.0 "g = sns.heatmap(trainingFeatures.corr(),cmap=""Blues"",annot=False)",No,5,80.0 "#trainingFeatures2 = trainingFeatures.drop(['Pregnancies','BloodPressure','DiabetesPedigreeFunction', 'Age','SkinThickness','Insulin','Id'], axis=1) trainingFeatures2 = trainingFeatures.drop(['Id'], axis=1)",No,5,10.0 "g = sns.heatmap(trainingFeatures2.corr(),cmap=""Blues"",annot=False) print(trainingFeatures2.corr())",Yes,3,80.0 "
#model = DecisionTreeClassifier(max_depth=8,min_samples_leaf=2)
#0.70-no norm . 0.76--w/ normalization
""""""
model = LogisticRegression(penalty=\'l2\', dual=False, tol=0.0001,
C=1.0, fit_intercept=True, intercept_scaling=1,
class_weight=None, random_state=10, solver=\'liblinear\',
max_iter=100, multi_class=\'ovr\', verbose=0, warm_start=False, n_jobs=1)
""""""
#model =GaussianNB() #74.17
#model= RandomForestClassifier(max_depth=6, random_state=0)
#"""""" %77.00
model= RandomForestClassifier(bootstrap=True, class_weight=None, criterion=\'gini\',
max_features=\'auto\', max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=1,
random_state=10, verbose=0, warm_start=False)
#""""""
#model = AdaBoostClassifier(DecisionTreeClassifier(max_depth=6), n_estimators=150, random_state=10) #74

""""""70.07
model = MLPClassifier(activation=\'relu\', alpha=1e-05, batch_size=\'auto\',
beta_1=0.9, beta_2=0.999, early_stopping=False,
epsilon=1e-08, hidden_layer_sizes=(5, 2), learning_rate=\'constant\',
learning_rate_init=0.001, max_iter=200, momentum=0.9,
nesterovs_momentum=True, power_t=0.5, random_state=10, shuffle=True,
solver=\'lbfgs\', tol=0.0001, validation_fraction=0.1, verbose=False,
warm_start=False)
""""""
#model = LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=\'auto\', solver=\'eigen\', store_covariance=False, tol=0.0001)
#model = QuadraticDiscriminantAnalysis(priors=None, reg_param=0.0, store_covariance=False, store_covariances=None, tol=0.0001)

#SVC: rbf(64) linear(76) poly-3
""""""
model = SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape=\'ovr\', degree=2, gamma=\'auto\', kernel=\'linear\',
max_iter=-1, probability=False, random_state=None, shrinking=True,
tol=0.001, verbose=False)
""""""
#gradientBoostingClassifier . 0.77
#params = {\'max_depth\':9, \'subsample\':0.5, \'learning_rate\':0.01, \'min_samples_leaf\':1, \'random_state\':0}
#model = GradientBoostingClassifier(n_estimators=290,**params)

#model = GradientBoostingClassifier()

X_train, X_test, y_train, y_test = train_test_split(trainingFeatures2, trainingLabels, test_size=0.1, random_state=10)


#original
model.fit(X_train, y_train)

#scaler = StandardScaler()
#X_train_scaler = scaler.fit_transform(X_train)
#model.fit(X_train_scaler, y_train)

columns = trainingFeatures2.columns
feature_names = trainingFeatures2.columns.values

#coefficients = model.feature_importances_.reshape(trainingFeatures2.columns.shape[0], 1)
#absCoefficients = abs(coefficients)
#fullList = pd.concat((pd.DataFrame(columns, columns = [\'Feature\']), pd.DataFrame(absCoefficients, columns = [\'absCoefficient\'])), axis = 1).sort_values(by=\'absCoefficient\', ascending = False)
#print(\'\\nFeature Importance:\\n\\n\',fullList,\'\\n\')
#plot_decision_tree(model,feature_names)",Yes,1,21.0 "kfold = KFold(n_splits=10, random_state=10)
results = cross_val_score(model, trainingFeatures2, trainingLabels, cv=kfold)
#print(""DecisionTreeClassifier:\
\
Cross_Val_Score: %.2f%% (%.2f%%)"" % (results.mean()*100, results.std()*100))
print(""Logistic Regression Classifier:\
\
Cross_Val_Score: %.2f%% (%.2f%%)"" % (results.mean()*100, results.std()*100))
#original
prediction = model.predict(X_test)

#X_test_scaler = scaler.fit_transform(X_test)
#prediction = model.predict(X_test_scaler)

cnf_matrix = confusion_matrix(y_test, prediction)
dict_characters = {0: \'Healthy\', 1: \'Diabetes\'}
plot_confusion_matrix(cnf_matrix, classes=dict_characters,title=\'Confusion matrix\')",Yes,2,48.0 "test = testingData test = pd.DataFrame(test) test.columns=['Id','Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age'] #test2 = test.drop(['Id','Pregnancies','BloodPressure','DiabetesPedigreeFunction', 'Age','SkinThickness','Insulin'], axis=1) test2 = test.drop(['Id' ], axis=1) #test2_scaler = scaler.fit_transform( test2) my_predictions = model.predict(test2) #my_predictions = model.predict(test2_scaler) Identifier = test.Id.astype(int) my_submission = pd.DataFrame({'Id': Identifier, 'Outcome': my_predictions}) my_submission.to_csv('my_submission.csv', index=False) my_submission.head(10)",Yes,1,12.0 "import pandas as pd import numpy as np import matplotlib.pyplot as plt from sklearn.model_selection import GridSearchCV from sklearn.ensemble import GradientBoostingClassifier from sklearn.ensemble import AdaBoostClassifier from sklearn.ensemble import RandomForestClassifier from sklearn.neural_network import MLPClassifier from sklearn import svm %matplotlib inline",Yes,3,22.0 "X = train.drop(['Outcome'], axis = 1) y = train.Outcome",Yes,3,10.0 "# parameters = {'criterion': ('gini', 'entropy'), 'n_estimators': [10, 50, 100, 105, 150]} # gb = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1) clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1) # clf = GridSearchCV(gb, parameters) clf.fit(X,y)",Yes,2,4.0 predicted = clf.predict(test),No,5,48.0 print(predicted),No,1,53.0 "output = pd.DataFrame(predicted,columns = ['Outcome']) test = pd.read_csv('../input/test.csv') output['Id'] = test['Id'] output[['Id','Outcome']].to_csv('submission.csv', index = False) output.head()",Yes,1,12.0 "import numpy as np import pandas as pd import matplotlib.pyplot as plt import seaborn as sns import itertools import graphviz from sklearn.preprocessing import Imputer from sklearn.model_selection import train_test_split, cross_val_score, KFold from sklearn.metrics import confusion_matrix from sklearn import tree from sklearn.tree import DecisionTreeClassifier from sklearn.impute import SimpleImputer from pandas import read_csv from sklearn.ensemble import GradientBoostingClassifier from xgboost import XGBClassifier %matplotlib inline",Yes,3,22.0 "trainingData = read_csv('../input/train.csv') trainingData=trainingData[['Id','Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age','Outcome']] testingData = read_csv('../input/test.csv') testingData=testingData[['Id','Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age']] trainingFeatures = trainingData.iloc[:, :-1] trainingLabels = trainingData.iloc[:, -1] imputer = SimpleImputer(missing_values=0,strategy='median') trainingFeatures = imputer.fit_transform(trainingFeatures) trainingFeatures = pd.DataFrame(trainingFeatures) trainingFeatures.columns=['Id','Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age'] testingData = imputer.transform(testingData) testingData = pd.DataFrame(testingData) testingData.columns=['Id','Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age'] trainingFeatures2 = trainingFeatures.drop(['Id'], axis=1)",Yes,1,12.0 "#model = DecisionTreeClassifier(max_depth=2,min_samples_leaf=2)
model = GradientBoostingClassifier(n_estimators=110, max_depth=3, min_samples_split=310, min_samples_leaf=5, max_features=7, subsample=0.85, learning_rate=0.1)
#model = XGBClassifier( learning_rate =0.1, n_estimators=10, max_depth=5, min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8, objective= \'binary:logistic\', nthread=4, scale_pos_weight=1, seed=27)
kfold = KFold(n_splits=10, random_state=7)
results = cross_val_score(model, trainingFeatures2, trainingLabels, cv=kfold)
print(""DecisionTreeClassifier:\
\
Cross_Val_Score: %.2f%% (%.2f%%)"" % (results.mean()*100, results.std()*100))

model.fit(trainingFeatures2, trainingLabels)",Yes,1,4.0 "test = testingData test = pd.DataFrame(test) test.columns=['Id','Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age'] test2 = test.drop(['Id'], axis=1) my_predictions = model.predict(test2) Identifier = test.Id.astype(int) my_submission = pd.DataFrame({'Id': Identifier, 'Outcome': my_predictions}) my_submission.to_csv('my_submission.csv', index=False)",Yes,1,12.0 "from __future__ import print_function import math from IPython import display from matplotlib import cm from matplotlib import gridspec from matplotlib import pyplot as plt import numpy as np import pandas as pd from sklearn import metrics import tensorflow as tf from tensorflow.python.data import Dataset import seaborn as sns tf.logging.set_verbosity(tf.logging.ERROR) pd.options.display.max_rows = 10 pd.options.display.float_format = '{:.1f}'.format ",Yes,3,22.0 "data = pd.read_csv(""../input/train.csv"")",No,5,45.0 "data = data.reindex( np.random.permutation(data.index)) data.head()",Yes,3,41.0 "sns.heatmap(data.corr(), annot=True)",No,5,80.0 data.isnull().sum(),No,5,39.0 data['Outcome'].hist(bins = 20),No,5,33.0 "sns.pairplot(data, hue='Outcome')",No,5,33.0 "data[['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']].hist(figsize=(16, 10), bins=50, xlabelsize=8, ylabelsize=8);",No,5,33.0 "data=data[data['Pregnancies']<=11] data=data[data['BMI']>=12] data=data[data['BloodPressure']>40] data=data[data['Glucose']>40] data=data[data['SkinThickness']<60]",No,5,14.0 data.describe(),No,4,40.0 "def my_input_fn(features, targets, batch_size=1, shuffle=True, num_epochs=None): # Convert pandas data into a dict of np arrays. features = {key:np.array(value) for key,value in dict(features).items()} # Construct a dataset, and configure batching/repeating. ds = Dataset.from_tensor_slices((features,targets)) # warning: 2GB limit ds = ds.batch(batch_size).repeat(num_epochs) # Shuffle the data, if specified. if shuffle: ds = ds.shuffle(10000) # Return the next batch of data. features, labels = ds.make_one_shot_iterator().get_next() return features, labels",Yes,1,12.0 "def train_linear_classifier_model(
learning_rate,
steps,
hidden_units,
batch_size,
training_examples,
training_targets,
validation_examples,
validation_targets):
periods = 45
steps_per_period = steps / periods
# Create a linear classifier object.
my_optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, 5.0)
DNN_classifier = tf.estimator.DNNClassifier(
feature_columns=construct_feature_columns(training_examples),
hidden_units=hidden_units,
optimizer=my_optimizer
)
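# Despite the helper name, the estimator built here is a DNNClassifier; wrapping Adam in
# clip_gradients_by_norm caps the global gradient norm at 5.0 before each update is applied.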
# Create input functions.
training_input_fn = lambda: my_input_fn(training_examples,
training_targets[""Outcome""],
batch_size=batch_size)
predict_training_input_fn = lambda: my_input_fn(training_examples,
training_targets[""Outcome""],
num_epochs=1,
shuffle=False)
predict_validation_input_fn = lambda: my_input_fn(validation_examples,
validation_targets[""Outcome""],
num_epochs=1,
shuffle=False)

# Train the model, but do so inside a loop so that we can periodically assess
# loss metrics.
print(""Training model..."")
print(""LogLoss (on training data):"")
training_log_losses = []
validation_log_losses = []
for period in range (0, periods):
# Train the model, starting from the prior state.
DNN_classifier.train(
input_fn=training_input_fn,
steps=steps_per_period
)
# Take a break and compute predictions.
training_probabilities = DNN_classifier.predict(input_fn=predict_training_input_fn)
training_probabilities = np.array([item[\'probabilities\'] for item in training_probabilities])

validation_probabilities = DNN_classifier.predict(input_fn=predict_validation_input_fn)
validation_probabilities = np.array([item[\'probabilities\'] for item in validation_probabilities])

training_log_loss = metrics.log_loss(training_targets, training_probabilities)
validation_log_loss = metrics.log_loss(validation_targets, validation_probabilities)
# Occasionally print the current loss.
print("" period %02d : %0.2f"" % (period, training_log_loss))
# Add the loss metrics from this period to our list.
training_log_losses.append(training_log_loss)
validation_log_losses.append(validation_log_loss)
print(""Model training finished."")
# Output a graph of loss metrics over periods.
plt.ylabel(""LogLoss"")
plt.xlabel(""Periods"")
plt.title(""LogLoss vs. Periods"")
plt.tight_layout()
plt.plot(training_log_losses, label=""training"")
plt.plot(validation_log_losses, label=""validation"")
plt.legend()
return DNN_classifier",Yes,1,4.0 "DNN_classifier = train_linear_classifier_model( learning_rate=0.001, steps=800, batch_size=80, hidden_units=[10, 10,10], training_examples=training_examples, training_targets=training_targets, validation_examples=validation_examples, validation_targets=validation_targets)",No,4,4.0 "testData = pd.read_csv(""../input/test.csv"")",No,5,45.0 testData.head(),No,5,41.0 "testData.isna().sum() testData['Outcome'] = 0",Yes,4,39.0 "test_examples = preprocess_features(testData) test_examples.head()",Yes,3,41.0 test_validations = preprocess_targets(testData),No,1,53.0 "predict_test_input_fn = lambda: my_input_fn(test_examples,
test_validations[""Outcome""],
num_epochs=1,
shuffle=False)

test_predictions = DNN_classifier.predict(input_fn=predict_test_input_fn)
test_predictions = np.array([item[\'probabilities\'][1] for item in test_predictions])
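# Each prediction dict exposes a probabilities array; index 1 is the predicted probability of
# Outcome == 1, which the histogram below visualizes.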

_ = plt.hist(test_predictions)",Yes,2,48.0 "testData[['Id','Outcome']].to_csv('Submit.csv', index = False)",No,5,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here\'s several helpful packages to load in

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the ""../input/"" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir(""../input""))

#id,y,Usage
#1,100,Public
#2,100,Private

df = pd.DataFrame({\'id\': [1,2], \'y\': [100,100]})
df.head()
df.to_csv(""submission.csv"", header = True, index = False)
# Any results you write to the current directory are saved as output.",Yes,4,22.0 " import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) from pandas import DataFrame, Series import random from tqdm import tqdm import os import math import numpy as np import h5py import matplotlib.pyplot as plt import tensorflow as tf from tensorflow.python.framework import ops import cv2 from keras.utils import to_categorical import glob from matplotlib import pyplot as plt import cv2 from keras.models import Sequential, Model from keras.layers import Dense, Dropout, Activation, Flatten, Conv2D, Flatten, MaxPool2D from keras.optimizers import adam from keras import regularizers from keras.utils import plot_model from keras.applications.vgg19 import VGG19 from keras.layers import Input, Dense, Dropout from keras import backend as K # Any results you write to the current directory are saved as output.",No,5,22.0 "train_path = '../input/train/*.jpg' x_train_adres = glob.glob(train_path) m_train = len(x_train_adres) y_train = np.zeros((m_train,1)) for i,ca in enumerate(x_train_adres): if 'cat' in ca: y_train[i] = 1 print(y_train.shape) # print(y_train) # print(x_train_adres[m_train-1]) ",Yes,2,58.0 "trainingFeatures2 = trainingFeatures.drop(['Pregnancies','BloodPressure','DiabetesPedigreeFunction', 'Age','SkinThickness','Insulin','Id'], axis=1)",No,5,10.0 "test = testingData test = pd.DataFrame(test) test.columns=['Id','Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age'] test2 = test.drop(['Id','Pregnancies','BloodPressure','DiabetesPedigreeFunction', 'Age','SkinThickness','Insulin'], axis=1) my_predictions = model.predict(test2) Identifier = test.Id.astype(int) my_submission = pd.DataFrame({'Id': Identifier, 'Outcome': my_predictions}) my_submission.to_csv('my_submission.csv', index=False) my_submission.head()",Yes,2,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here\'s several helpful packages to load in

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the ""../input/"" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir(""../input""))

# Any results you write to the current directory are saved as output.",No,5,88.0 "import numpy as np import pandas as pd train = pd.read_csv(""../input/train.csv"") test = pd.read_csv(""../input/test.csv"")",Yes,4,45.0 "X_train = train.iloc[:, 2:-1].values Y_train = train.iloc[:, 21].values X_test = test.iloc[:, 2:].values",No,5,21.0 "X_all = np.concatenate((X_train, X_test), axis=0) ",No,5,11.0 "from sklearn.preprocessing import Imputer imputer = Imputer(missing_values = 'NaN', strategy = 'most_frequent', axis = 0) imputer = imputer.fit(X_all) X_all = imputer.transform(X_all)",No,5,17.0 "from sklearn.preprocessing import QuantileTransformer sc = QuantileTransformer() X = sc.fit_transform(X) X_test = sc.transform(X_test)",No,4,8.0 "from sklearn.kernel_approximation import AdditiveChi2Sampler sc = AdditiveChi2Sampler() X = sc.fit_transform(X) X_test = sc.transform(X_test)",No,3,8.0 "from sklearn.ensemble import AdaBoostClassifier clf = AdaBoostClassifier(n_estimators = 60 , learning_rate = 0.3) clf.fit(X, Y) Y_pred = clf.predict(X) from sklearn.metrics import accuracy_score accuracy_score(Y, Y_pred)",Yes,3,4.0 Y_test_pred = clf.predict(X_test),No,5,48.0 "cols = { \'PlayerID\': [i+901 for i in range(440)] , \'TARGET_5Yrs\': Y_test_pred }
submission = pd.DataFrame(cols)
print(submission)

submission.to_csv(""submission.csv"", index=False)",Yes,4,25.0 "import numpy as np import pandas as pd from sklearn.model_selection import KFold from sklearn.metrics import accuracy_score from sklearn.ensemble import AdaBoostClassifier from sklearn.ensemble import RandomForestClassifier from sklearn.ensemble import AdaBoostClassifier from sklearn.ensemble import GradientBoostingClassifier from sklearn.ensemble import ExtraTreesClassifier from sklearn.preprocessing import MinMaxScaler from sklearn.preprocessing import StandardScaler from sklearn.preprocessing import QuantileTransformer from sklearn.decomposition import PCA from sklearn.preprocessing import RobustScaler from sklearn.preprocessing import KernelCenterer from sklearn import svm",No,5,22.0 "data = pd.read_csv(""../input/train.csv"") test = pd.read_csv(""../input/test.csv"") submition = pd.read_csv(""../input/sample_submission.csv"")",No,5,45.0 "input_label = np.array(data.get('TARGET_5Yrs')) data = data.drop(['PlayerID','Name','TARGET_5Yrs'] ,axis=1) data = data.fillna(data.mean()) test = test.drop(['PlayerID','Name'] ,axis=1) test = test.fillna(test.mean())",Yes,3,10.0 "quantile = QuantileTransformer(n_quantiles=3000) data[np.array(data.columns[:])] = quantile.fit_transform(data[np.array(data.columns[:])]) test[np.array(test.columns[:])] = quantile.transform(test[np.array(test.columns[:])])",No,5,8.0 "gradientBoosting_clf = GradientBoostingClassifier(n_estimators=350, learning_rate=.1,max_depth=1) gradientBoosting_clf.fit(data,input_label) gradientBoosting_given_labels = gradientBoosting_clf.predict(final)",Yes,3,4.0 "submition.iloc[:,1] = gradientBoosting_given_labels",No,5,14.0 "submition.to_csv(""submission_6.csv"", index=False)",No,5,25.0 "#imports import csv import numpy as np from sklearn import datasets from sklearn import svm from sklearn.preprocessing import Imputer import pandas as pd from sklearn.model_selection import cross_val_score from sklearn import preprocessing from sklearn import metrics from sklearn.preprocessing import StandardScaler from sklearn.decomposition import PCA ",No,5,22.0 "#reading datasets train = pd.read_csv(""../input/trainecsv/train.csv"") test = pd.read_csv(""../input/iust-nba-rookies/test.csv"") # print(train.info()) #print(test.info())",No,5,45.0 "#DATA preporcessing #standardizing std_scale = preprocessing.StandardScaler().fit(train) train_std = std_scale.transform(train) test_std = std_scale.transform(test) # #PCA # pca_std = PCA(n_components=10).fit(train_std) # train_stdwPCA = pca_std.transform(train_std) # test_stdwPCA = pca_std.transform(test_std) #normalize train_normalized = preprocessing.normalize(train_std, norm='l2') test_normalized = preprocessing.normalize(test_std, norm='l2') ",No,5,18.0 "# #KNN (3) from sklearn.neighbors import KNeighborsClassifier # #find best k for knn # accs=[] # ks=[] # for k in range (1,50): # Tknn=KNeighborsClassifier(n_neighbors=k) # acc=cross_val_score(Tknn, train_normalized, train_labels, cv=10, scoring='accuracy') # accs.append(acc.mean()) # ks.append(k) # print('Best K value in KNN with Max Accuracy is :',(accs.index(max(accs))+1)) # print('Best Accuracy : ', max(accs)) # best_k = accs.index(max(accs))+1 #use best K for knn knn=KNeighborsClassifier(n_neighbors=2).fit(train_normalized,train_labels) #acc2=cross_val_score(knn, train_normalized, train_labels, cv=10, scoring='accuracy') trainpred=knn.predict(train_normalized) #testpred=knn.predict(test_normalized) #print(metrics.accuracy_score(train_labels, trainpred)) # print(acc2) # 
print(np.mean(acc2)) #results.append(knn.predict(test_normalized)) # print(knn.get_params().keys()) ",Yes,4,7.0 "#MLP (4) from sklearn.neural_network import MLPClassifier MLP = MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(25, 10), random_state=1).fit(train_normalized,train_labels) trainpred=MLP.predict(train_normalized) #testpred=MLP.predict(test_normalized) #print(metrics.accuracy_score(train_labels, trainpred)) #acc3=cross_val_score(MLP, train_normalized, train_labels, cv=10, scoring='accuracy') # print(acc3) # print(np.mean(acc3)) #results.append(MLP.predict(test_normalized)) # print(MLP.get_params().keys()) ",Yes,3,7.0 "#GradientBoosting (7) from sklearn.ensemble import GradientBoostingClassifier GBC_clf = GradientBoostingClassifier(n_estimators=2000, learning_rate=0.008, max_depth=1, random_state=1).fit(train_normalized, train_labels) trainpred=GBC_clf.predict(train_normalized) #testpred=GBC_clf.predict(test_normalized) #print(metrics.accuracy_score(train_labels, trainpred)) # GBC_acc=cross_val_score(GBC_clf,train_normalized,train_labels,cv=20,scoring='accuracy') # print(GBC_acc) # print(np.mean(GBC_acc))",Yes,3,7.0 "#randomForest (8) from sklearn.ensemble import RandomForestClassifier random_forest_clf = RandomForestClassifier(n_estimators=100).fit(train_normalized,train_labels) #acc_random_forest = cross_val_score(random_forest_clf, train, train_labels, cv=10, scoring='accuracy') trainpred=random_forest_clf.predict(train_normalized) #testpred=random_forest_clf.predict(test_normalized) #print(metrics.accuracy_score(train_labels, trainpred)) # print(acc_random_forest) # print(np.mean(acc_random_forest)) #results.append(random_forest_clf.predict(test_normalized))",Yes,3,7.0 "#DecisionTree (9) from sklearn.tree import DecisionTreeClassifier DT_clf = DecisionTreeClassifier(max_depth=15, min_samples_split=3,random_state=6) DT_clf.fit(train_normalized,train_labels) trainpred=DT_clf.predict(train_normalized) #testpred=DT_clf.predict(test_normalized) #print(metrics.accuracy_score(train_labels, trainpred)) # DT_acc = cross_val_score(DT_clf, train_normalized, train_labels, cv=20, scoring='accuracy') # print(DT_acc) # print(np.mean(DT_acc)) ",Yes,3,7.0 "# ExtraTreesClassifier (10) from sklearn.ensemble import ExtraTreesClassifier ET_clf = ExtraTreesClassifier(n_estimators=30, max_depth=12,min_samples_split=3, random_state=0) ET_clf.fit(train_normalized,train_labels) trainpred=ET_clf.predict(train_normalized) #testpred=ET_clf.predict(test_normalized) #print(metrics.accuracy_score(train_labels, trainpred)) # ET_acc = cross_val_score(ET_clf, train_normalized, train_labels, cv=20, scoring='accuracy') # print(ET_acc) # print(np.mean(ET_acc)) ",Yes,3,7.0 "#AdaBoost Classifier (12) from sklearn.ensemble import AdaBoostClassifier AdB_clf = AdaBoostClassifier(n_estimators=450) AdB_clf.fit(train_normalized,train_labels) trainpred=AdB_clf.predict(train_normalized) #testpred=AdB_clf.predict(test_normalized) #print(metrics.accuracy_score(train_labels, trainpred)) # AdB_acc = cross_val_score(AdB_clf, train_normalized, train_labels, cv=20, scoring='accuracy') # print(AdB_acc) # print(np.mean(AdB_acc))",Yes,3,7.0 "#voting
from sklearn.ensemble import VotingClassifier
# from sklearn.model_selection import GridSearchCV

ens_clf=VotingClassifier(estimators=[(\'kn\', knn), (\'ml\', MLP),(\'gbc\', GBC_clf), (\'rf\', random_forest_clf),
(\'dt\', DT_clf), (\'et\', ET_clf), (\'adb\', AdB_clf)],
voting=\'soft\', weights=[1, 2, 3, 5, 5, 5, 4])
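# Soft voting averages the class probabilities of the seven estimators using the weights
# above and picks the class with the highest weighted average, so every estimator needs a
# working predict_proba, which all seven provide.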


# grid = GridSearchCV(estimator=ens_clf, cv=5)

ens_clf.fit(train_normalized,train_labels)

#ens_acc = cross_val_score(ens_clf, train_normalized, train_labels, cv=10, scoring=\'accuracy\')


trainpredEns=ens_clf.predict(train_normalized)
#print(metrics.accuracy_score(train_labels, trainpredEns))

# print (ens_acc)
# print(np.mean(ens_acc))

print(""ENSDone"")",Yes,3,7.0 "#predicting results

result=ens_clf.predict(test_normalized)

cols = { \'PlayerID\': [i+901 for i in range(440)] , \'TARGET_5Yrs\': result }
submission = pd.DataFrame(cols)


submission.to_csv(""submission.csv"", index=False)

print(submission.info())
print (submission)
print(""done"")",Yes,3,48.0 "from sklearn.neighbors import KNeighborsClassifier from sklearn.preprocessing import Imputer from sklearn.svm import SVC from sklearn.tree import DecisionTreeClassifier from sklearn.preprocessing import StandardScaler from statsmodels.compat import pandas as pd import pandas as pd from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis from sklearn.linear_model import LogisticRegression from sklearn.ensemble import RandomForestClassifier, VotingClassifier from sklearn.ensemble import AdaBoostClassifier from sklearn import svm, preprocessing from sklearn.decomposition import PCA",No,5,22.0 "test = pd.read_csv(""../input/test.csv"")
test_x = test.iloc[:, 2:].values
imp = Imputer(missing_values = \'NaN\', strategy = \'mean\', axis = 0)
imp=imp.fit(test_x)
test_x = imp.transform(test_x)
dataset = pd.read_csv(""../input/train.csv"")
print(dataset.info())
X = dataset.iloc[:, 2:-1].values
imputer = Imputer(missing_values = \'NaN\', strategy = \'mean\', axis = 0)
input_dim = X.shape[1]
imputer = imputer.fit(X)
X = imputer.transform(X)
y = dataset.iloc[:, 21].values

#Standard
sc = StandardScaler()
X_train = sc.fit_transform(X)
x_test = sc.transform(test_x)
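# The scaler is fit on the training matrix only and its means/variances are reused to
# transform the test matrix, so both end up on the same scale without using test statistics.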
#//////////////////////

#quantile_transformer = preprocessing.QuantileTransformer(random_state=0)
#Xprime = quantile_transformer.fit_transform(X)
#txprime = quantile_transformer.transform(test_x)
#///////////////
Xtrain = preprocessing.normalize(X_train, norm=\'l2\')
Xtest = preprocessing.normalize(x_test, norm=\'l2\')
#/////////////
# from sklearn.preprocessing import PolynomialFeatures
# poly = PolynomialFeatures(2)
# Xprime = poly.fit_transform(X)
# tsprime = poly.transform(test_x)
#/////////////////
# pca = PCA(n_components=2)
# Xprime=pca.fit_transform(Xprime)
# tsprime = pca.fit_transform(test_x)
# print(Xprime.shape)
# print(tsprime)
#////////////////////

print(X)
print(y)
print(test_x)",Yes,3,17.0 "#Classifiers
#Choosing best one
#/////////////////////////
# model = Sequential()
# model.add(Dense(20, input_dim=input_dim))
# model.add(Activation(\'relu\'))
# model.add(Dropout(0.15))
# model.add(Dense(10))
# model.add(Activation(\'relu\'))
# model.add(Dropout(0.25))
# model.add(Dense(1))
# model.add(Activation(\'softmax\'))
#///////////////
#SVM-RBF
#from sklearn.svm import SVC
#classifier = SVC(kernel = \'rbf\')
#classifier.fit(X, y)
#
#y_predsvm = classifier.predict(test_x)
#SVC//////////////////

#clf = SVC()
#clf.fit(X,y)
#y_predsvc= clf.predict(test_x)
# preds = model.predict_classes(test_x, verbose=0)
#////////////////////
#Gaussian Naive Bayes
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X, y)
#print(""gnb:"", gnb.score(X,y))
#y_predgnb = gnb.predict(test_x)
#
# model.compile(optimizer=\'rmsprop\', loss=\'mae\')
#
#
# model.fit(X, y, epochs=10)
#//////////////////////////
#KNN
knn = KNeighborsClassifier(n_neighbors = 5, metric = \'minkowski\', p = 2)
knn.fit(X, y)
#y_predknn = knn.predict(test_x)
#print(""knn:"",knn.score(X,y))
#/////////////////
#Random Forest
from sklearn.ensemble import RandomForestClassifier
rf= RandomForestClassifier(n_estimators = 100, criterion = \'entropy\')
rf.fit(X,y)
#y_predrf = rf.predict(test_x)
#print(""rf:"",rf.score(X,y))
#/////////////////////
#Gradient Boosting
from sklearn.ensemble import GradientBoostingClassifier
gb = GradientBoostingClassifier(random_state=0, learning_rate=0.01)
gb.fit(X, y)
#y_predgb = gb.predict(test_x)
#print(""gb:"", gb.score(X,y))
#////////////////
# Logistic Regression
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X, y)
#y_predlr = lr.predict(test_x)
#print(""lr:"",lr.score(X,y))
#y_pred = gb.predict(test_x)
#y_pred = classifier.predict(test_x)
#//////////QDA
qda= QuadraticDiscriminantAnalysis()
qda.fit(X,y)
#y_predqda = qda.predict(test_x)
#///////
#SVM
#svm = svm.SVC(kernel=\'linear\', C = 1.0)
#svm.fit(X,y)
#y_predsvm = svm.predict(test_x)
#print(""svm"",svm.score(X,y))
# print(clf.predict(test_x))

#/////////////
#Adaboost
adb = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),algorithm=""SAMME"",n_estimators=200)

adb.fit(X,y)
#y_predadb = adb.predict(test_x)
#print(""adb:"",adb.score(X,y))
#///////////
# Voting Classifier(LR, RF,AdaBoost,SVM,GNB,Knn,GBC)

clf1 = LogisticRegression(random_state=100)
clf2 = RandomForestClassifier(n_estimators = 100, criterion = \'entropy\')
clf3 = SVC(gamma=2, C=1)
clf4 = GradientBoostingClassifier(random_state=0, learning_rate=0.01)
#clf3 = GradientBoostingClassifier(random_state=0, learning_rate=0.01)
clf5 = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
algorithm=""SAMME"",
n_estimators=200)
clf6 = GaussianNB()
clf7 = KNeighborsClassifier(n_neighbors = 5, metric = \'minkowski\', p = 2)
#clf8 = svm
#clf9 = QuadraticDiscriminantAnalysis()
# Majority Vote
eclf1 = VotingClassifier(estimators=[(\'lr\', clf1), (\'rf\', clf2), (\'svm\', clf3), (\'gbc\', clf4),(\'adb\',clf5),(\'gnb\',clf6),(\'knn\',clf7)], voting=\'hard\')
eclf1 = eclf1.fit(X, y)
preds= eclf1.predict(test_x)

print(preds)
",Yes,3,7.0 "cols = { \'PlayerID\': [i+901 for i in range(440)] , \'TARGET_5Yrs\': [eclf1.predict([test_x[i]])[0] for i in range(440)] }
submission = pd.DataFrame(cols)
print(submission)
submission.to_csv(""submission1.csv"", index=False)",Yes,4,25.0 "import pandas as pd import numpy as np import sklearn from sklearn import preprocessing from sklearn.decomposition import PCA from sklearn.ensemble import IsolationForest from sklearn import svm import statistics import matplotlib.pyplot as plt from sklearn.neural_network import MLPClassifier from sklearn.neighbors import KNeighborsClassifier from sklearn.svm import SVC from sklearn.gaussian_process import GaussianProcessClassifier from sklearn.gaussian_process.kernels import RBF from sklearn.tree import DecisionTreeClassifier from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier from sklearn.naive_bayes import GaussianNB from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis ",No,5,22.0 "t_data = pd.read_csv(""../input/train.csv"") ts_data = pd.read_csv(""../input/test.csv"")",No,5,45.0 "ddata = t_data.drop([""PlayerID"",""Name""], axis=1) sdata = ts_data.drop([""PlayerID"",""Name""], axis=1)",No,5,10.0 "ddata = ddata.interpolate() ddata = ddata.replace([np.inf], np.float64.max) ddata = ddata.replace([-np.inf], np.float64.min) features = ddata.loc[:, ddata.columns.values[:len(ddata.columns.values)-1]].values labels = ddata.loc[:, ['TARGET_5Yrs']].values st_features = preprocessing.StandardScaler().fit_transform(features) sdata = sdata.interpolate() sdata = sdata.replace([np.inf], np.float64.max) sdata = sdata.replace([-np.inf], np.float64.min) sfeatures = sdata.loc[:, sdata.columns.values].values st_sfeatures = preprocessing.StandardScaler().fit_transform(sfeatures)",Yes,3,17.0 "pca = PCA(n_components=10)

pca.fit(features)
principalComponents = pca.transform(features)
test_principalComponenta = pca.transform(sfeatures)
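# PCA is fit on the training features only; the same 10 components then project the test
# features, so both sets live in one coordinate system.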
print(principalComponents.shape, ""\\n"", test_principalComponenta.shape)
submission = pd.DataFrame(cols)
submission.to_csv(""submission.csv"", index=False)
print(submission)",Yes,4,25.0 "import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.utils import shuffle
from scipy.stats import mode
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score

from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
# from sklearn.ensemble import IsolationForest
from sklearn.ensemble import IsolationForest
from sklearn.naive_bayes import GaussianNB
# from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import VotingClassifier

# =========================================================

names = [""Nearest Neighbors"", ""Linear SVM"", ""RBF SVM"",
""Decision Tree"", ""Random Forest"", ""Neural Net"", ""AdaBoost"",
""Naive Bayes"", ""Logistic Regression"", ""Bagging"", ""GradBoost"", ""ExtraTree""]#, ""Gaussian Process""]

classifiers = [
KNeighborsClassifier(n_neighbors=5),
SVC(kernel=""linear"", C = 0.6), #C=0.025
SVC(C = 0.6),
DecisionTreeClassifier(),
RandomForestClassifier(n_estimators=200, max_depth=2),
MLPClassifier(alpha=1),
AdaBoostClassifier(n_estimators=200),
GaussianNB(),
LogisticRegression(random_state=1),
BaggingClassifier(n_estimators=200),
GradientBoostingClassifier(n_estimators=350, learning_rate=.1,max_depth=2),
ExtraTreeClassifier()]#,
#GaussianProcessClassifier(1.0 * RBF(1.0))]

# =========================================================

# Read DataSet and put in X and y
dataset = pd.read_csv(\'../input/train.csv\')
dataset_test = pd.read_csv(""../input/test.csv"")
to_drop = [\'PlayerID\', \'Name\']
dataset.drop(to_drop, inplace=True, axis=1)
dataset_test.drop(to_drop, inplace=True, axis=1)

#dataset = dataset.interpolate(method=\'values\')
dataset = dataset.fillna(dataset.mean())
dataset_test = dataset_test.fillna(dataset_test.mean())

dataset = shuffle(dataset)

#--------------------------------------

X = dataset.iloc[:, 0:-1].values
X[:, 8] = 1
y = dataset.iloc[:, 19].values

X_value = dataset_test.iloc[:, 0:].values
X_value[:, 8] = 1
# =========================================================

# #preprocess dataset, split into training and test part
# sc = StandardScaler()
# X = sc.fit_transform(X)
# X_value = sc.fit_transform(X_value)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=42)
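# 25% of the rows are held out; each classifier below is scored on this holdout with a
# binary F1 before the per-classifier predictions are combined by majority vote.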

# =========================================================

# iterate over classifiers
y_pred_matrix = []
for name, clf in zip(names, classifiers):
clf.fit(X_train, y_train)
#score = clf.score(X_test, y_test)
#print(name, score)
y_pred = clf.predict(X_test)
y_pred[y_pred==-1] = 0
y_pred_matrix.append(y_pred)
f1_scr = f1_score(y_test, y_pred, average=\'binary\')
print(name, f1_scr)

y_pred_matrix = np.array(y_pred_matrix)
final_pred = mode(y_pred_matrix, axis=0)[0]
final_pred = final_pred.flatten()
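# scipy.stats.mode along axis 0 picks, for every test row, the label predicted by the most
# classifiers, i.e. a plain unweighted majority vote across the twelve models.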
final_score = f1_score(y_test, final_pred, average=\'binary\')
print (final_score)

# =========================================================

#Voting by all classifiers
ZipList = list(zip(names, classifiers))
clf_Vot = VotingClassifier(estimators = ZipList, voting=\'hard\')
clf_Vot.fit(X_train, y_train)
#score = clf_Vot.score(X_test, y_test)
#print(score)
y_pred = clf_Vot.predict(X_test)
f1_scr = f1_score(y_test, y_pred, average=\'binary\')
print(f1_scr)

# =========================================================

clf_Vot.fit(X, y)
y_pre = clf_Vot.predict(X_value)
#print(y_pre)

cols = { \'PlayerID\': [i+901 for i in range(440)] , \'TARGET_5Yrs\': y_pre }
result = pd.DataFrame(cols)
result.to_csv(""Sub_Vot.csv"", index=False)
#result

# =========================================================

clf_GB = GradientBoostingClassifier(n_estimators=350, learning_rate=.1, max_depth=1)
clf_GB.fit(X, y)
y_pre = clf_GB.predict(X_value)
#print(y_pre)

cols = { \'PlayerID\': [i+901 for i in range(440)] , \'TARGET_5Yrs\': y_pre }
result = pd.DataFrame(cols)
result.to_csv(""Sub_GB.csv"", index=False)
#result

print(""Complete Runing!"")",No,2,22.0 "import numpy as np
import pandas as pd
import sklearn as sk
from sklearn import svm
from sklearn import ensemble
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import linear_model
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier

# PlayerID,Name,GP,MIN,PTS,FGM,FGA,FG%,3P Made,3PA,3P%,FTM,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,TARGET_5Yrs

train = pd.read_csv(""../input/train.csv"")
test = pd.read_csv(""../input/test.csv"")
X = train.values[:,2:20]
y = train.values[:,21]
y = y.astype('int')
print (X)
print (y)

test_X = test.values[:,2:20]
# test_y = test.values[:,21]
print (test_X)
# print (test_y)
",No,3,45.0 "# testing various classifiers to choose the best accuracy.

# linear SVM
# clf = svm.SVC()
# clf.fit(X, y)
#
# SVM with sigmoid kernel
# clf = svm.SVC(kernel=\'sigmoid\')
# clf.fit(X, y)
#
# SVM with rbf kernel
# clf = svm.SVC(kernel=\'rbf\')
# clf.fit(X, y)
#
# SVM with poly kernel
# clf = svm.SVC(kernel=\'poly\')
# clf.fit(X, y)
#
# adaboost
# clf = AdaBoostClassifier(n_estimators = 350)
# clf.fit(X, y)
#
# random forest
# clf = RandomForestClassifier(n_estimators = 250)
# clf.fit(X, y)
#
# decision tree
# clf = DecisionTreeClassifier()
# clf.fit(X, y)
#
# extra tree
# clf = ExtraTreesClassifier()
# clf.fit(X, y)
#
# gaussian naive bayes
# clf = GaussianNB()
# clf.fit(X, y)
#
# logistic regression
# clf = linear_model.LogisticRegression()
# clf.fit(X, y)
#
# stochastic gradient descent
# clf = SGDClassifier(loss=""squared_loss"", penalty=""l2"")
# clf = SGDClassifier(loss=""hinge"", penalty=""l2"")
# clf.fit(X, y)
#
# multi layer perceptron
# scaler = StandardScaler()
# scaler.fit(X)
# X = scaler.transform(X)
# test_X = scaler.transform(test_X)
# print (scaler)
# print (X)
# print (test_X)
# clf = MLPClassifier(solver=\'lbfgs\', alpha=1e-4, hidden_layer_sizes=(10,5), warm_start=\'True\')
# clf.fit(X, y)
#
# Gradient boosting
params = {'n_estimators': 2000, 'learning_rate': 0.008}
clf = ensemble.GradientBoostingClassifier(**params)
clf.fit(X, y)
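# the submission below scores the test rows one at a time; a single vectorised
# clf.predict(test_X) would give the same values, and the 440 / 901 constants assume
# PlayerIDs 901-1340 in the test set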

cols = { 'PlayerID': [i+901 for i in range(440)] , 'TARGET_5Yrs': [clf.predict([test_X[i]])[0] for i in range(440)] }
submission = pd.DataFrame(cols)
print(submission)
submission.to_csv(""submission.csv"", index=False)",Yes,3,7.0 "# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here\'s several helpful packages to load in

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import calendar
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sn
from scipy import stats
# Input data files are available in the ""../input/"" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir(""../input""))

# Any results you write to the current directory are saved as output.",No,5,88.0 "train=pd.read_csv('../input/train.csv') train.info()",Yes,4,45.0 "train[\'date\']=train.datetime.apply(lambda x: x.split()[0])
train['hour']=train.datetime.apply(lambda x:x.split()[1].split(':')[0])
train['weekday'] = train.date.apply(lambda dateString : calendar.day_name[datetime.strptime(dateString,""%Y-%m-%d"").weekday()])
train['month']=train.date.apply(lambda dateString: calendar.month_name[datetime.strptime(dateString,'%Y-%m-%d').month])
train['season']=train.season.map({1:'Spring',2:'Summer',3:'Fall',4:'Winter'})
train['weather']=train.weather.map({1: "" Clear + Few clouds + Partly cloudy + Partly cloudy"",\
                                    2 : "" Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist "", \
                                    3 : "" Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds"", \
4 :"" Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog "" })",No,5,8.0 "category_vars=['hour','weekday','month','season','weather','holiday','workingday'] for var in category_vars: train[var]=train[var].astype('category') train.info()",Yes,4,16.0 "train=train.drop('datetime',axis=1)",No,5,10.0 "#model.(valdn_x, valdn_y, batch_size=32, verbose=1) predictions = model.predict(test_x, batch_size=32, verbose=1)",No,5,48.0 "with open(\'submission.csv\',\'w\') as f:
    f.write('id,label\n')
    for index in range(len(test_imgs)):
        img_id = basename(test_imgs[index]).split(""."")[0]
        prob = (predictions[index,0])
        #print(""index: {}, img_id: {}, prob:{}"".format(index,img_id, prob))
        f.write(""{},{}\n"".format(img_id, prob))",Yes,3,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here\'s several helpful packages to load in

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

# Input data files are available in the ""../input/"" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir(""../input""))

# Any results you write to the current directory are saved as output.",Yes,3,22.0 "from keras.layers import Conv2D,Dense,MaxPooling2D,BatchNormalization,Activation,Flatten from keras import Sequential from keras.initializers import glorot_normal from keras import optimizers from keras.models import Model from keras.applications.imagenet_utils import decode_predictions",No,5,22.0 "#################################TRYING OUT THE RESNET 50 ARCHITECTURE################################################ import random l=[1,2,3] ",Yes,3,22.0 "from keras.applications import VGG19 v=VGG19(weights=""imagenet"",include_top=False,input_shape=(120,120,3))",Yes,3,22.0 "#########defining the new model by defining my own last layer new_model=v.output new_model=Flatten()(new_model) new_model=Dense(10)(new_model) new_model=Activation(""relu"")(new_model) new_model=Dense(1,activation=""sigmoid"")(new_model) final_model=Model(input=v.input,output=new_model)",No,5,4.0 "###freezin all layers except from last 3 layers total_layers=len(final_model.layers) print(total_layers) for x in range(0,total_layers-4): final_model.layers[x].trainable=False # final_model.layers",No,5,84.0 "###train test split from sklearn.model_selection import train_test_split train_x,test_x,train_y,test_y=train_test_split(data,labels,test_size=0.2,random_state=100) ",No,5,13.0 "####compiling the model o=optimizers.adam() final_model.compile(loss=""binary_crossentropy"",metrics=[""accuracy""],optimizer=o)",Yes,4,59.0 "final_model.fit(train_x,train_y,batch_size=32,epochs=15,validation_split=0.2)",No,5,7.0 "preds=final_model.predict(test_x) new_preds=[] for x in preds: if x >0.5: new_preds.append(1) else: new_preds.append(0) new_preds=np.array(new_preds) new_preds=new_preds.reshape(len(new_preds),1) ",No,5,48.0 "sum(new_preds==test_y)/len(test_y) train_x=[] test_x=[] data=[] labels=[]",No,5,77.0 "########importing test file data=[] input_file_names=[] #####get the file names of the images to read them one by one for (dirpath, dirnames, filenames) in walk(""../input/dogs-vs-cats-redux-kernels-edition/test/""): input_file_names=filenames for x in input_file_names: img_file_name=x##getting name of the image file path=str(""../input/dogs-vs-cats-redux-kernels-edition/test/""+img_file_name)####making proper path of the image file i=image.load_img(path)####reading the image from the path i=i.resize((120,120))#####resizing the image iarray=image.img_to_array(i)####converting it to arrau data.append(iarray)#####appending the image to the list",No,5,84.0 "data=np.array(data)/255. preds=final_model.predict(data)",No,5,27.0 "df=pd.DataFrame({'id':new_input_test_file_names, 'label':preds})",No,5,12.0 "df.to_csv(""submission2.csv"",index=False)",No,5,25.0 "#import libraries import pandas as pd import numpy as np import os, random ,cv2 import keras from sklearn.model_selection import train_test_split from keras.models import Sequential from keras.layers import Conv2D, Dense, Flatten, Dropout, Activation, MaxPool2D from keras.optimizers import Adam, RMSprop from keras.losses import binary_crossentropy from keras.preprocessing.image import ImageDataGenerator import matplotlib.pyplot as plt ",No,5,22.0 "#specify train and test datasets paths train_path = '../input/train/' test_path = '../input/test/' #define image dimensions rows = 150 cols = 150 channels = 3",No,5,77.0 "#create a list of train image paths ""including image name""
train_images = [train_path+i for i in os.listdir(train_path)]
train_dogs = [train_path+i for i in os.listdir(train_path) if \'dog\' in i]
train_cats = [train_path+i for i in os.listdir(train_path) if \'cat\' in i]",No,5,77.0 "#create a list of test image paths ""including image name"" test_images = [test_path+i for i in os.listdir(test_path)]",No,5,77.0 "train_images = train_dogs[:3000] + train_cats[:3000] #randomly shuffle train images random.shuffle(train_images)",Yes,3,21.0 "def prep_data(image_path_list): x=[] y=[] for i in image_path_list: x.append(cv2.resize(plt.imread(i), #read then resize image (rows,cols), interpolation=cv2.INTER_CUBIC)) #appened new image to x for j in image_path_list: #create a label and append it to y if 'dog' in j: y.append(1) elif 'cat' in j: y.append(0) return x,y ",No,5,21.0 "X, y = prep_data(train_images)",No,5,21.0 "#split X,y into a train and validation data sets X_train, X_val, y_train, y_val = train_test_split(X,y, test_size=(1/3), random_state=1)",No,5,13.0 "X_test, y_test = prep_data(test_images)",No,5,21.0 "#create a keras CNN model from sctarch model = Sequential() model.add(Conv2D(32,(3,3), input_shape=(rows, cols, 3))) model.add(Activation('relu')) model.add(MaxPool2D(pool_size=(2,2))) model.add(Conv2D(64,(3,3))) model.add(Activation('relu')) model.add(MaxPool2D(pool_size=(2,2))) model.add(Conv2D(128,(3,3))) model.add(Activation('relu')) model.add(MaxPool2D(pool_size=(2,2))) model.add(Conv2D(256,(3,3))) model.add(Activation('relu')) model.add(MaxPool2D(pool_size=(2,2))) model.add(Flatten()) model.add(Dense(256)) model.add(Activation('relu')) model.add(Dropout(0.5)) model.add(Dense(256)) model.add(Activation('relu')) model.add(Dropout(0.5)) model.add(Dense(1)) model.add(Activation('sigmoid'))",No,5,4.0 "model.compile(optimizer='RMSprop', metrics=['accuracy'], loss='binary_crossentropy')",No,5,84.0 "#create a data generator object with some image augmentation specs datagen = ImageDataGenerator( rescale=1./ 255, shear_range=0.2, zoom_range=0.2, horizontal_flip=True ) ",No,5,84.0 "#create an iterator for both train and valid sets train_gen = datagen.flow(x=np.array(X_train), y=y_train, batch_size=50) valid_gen = datagen.flow(x=np.array(X_val), y=y_val, batch_size=50)",No,5,77.0 "#train/validate model model.fit_generator(train_gen, steps_per_epoch=40, epochs=50, verbose=1, validation_data=valid_gen, validation_steps=20)",No,5,7.0 "#create a data generator object for testing datagen = ImageDataGenerator(rescale = 1./255)",No,5,84.0 "test_gen = datagen.flow(np.array(X_test), batch_size = 100)",No,5,84.0 "#predict predictions = model.predict_generator(test_gen, steps=125, verbose=1)",No,5,48.0 "#submit id_num = range(1, len(predictions_dogs) + 1) submission = pd.DataFrame({""id"": id_num, ""label"":predictions_dogs}) submission.to_csv(""submission.csv"", index = False)",No,5,25.0 "import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import math
import IPython.display as ipd
import keras
from keras.models import *
from keras.callbacks import *
from keras.layers import *
from keras.preprocessing.image import random_brightness,random_rotation,random_shear,random_shift,random_zoom
import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
import skimage
import skimage.transform
import skimage.color
import imageio
import matplotlib.pyplot as plt
import seaborn as sns
import os

TRAIN_DIR = ""../input/train/""
TEST_DIR = ""../input/test/""

train = pd.DataFrame()
train['file'] = os.listdir(TRAIN_DIR)
train['class'] = train['file'].apply(lambda x: x.split('.')[0])
train['class_id'] = train['class'].apply(lambda x: 0 if x=='cat' else 1)
test = pd.DataFrame()
test['file'] = os.listdir(TEST_DIR)
test['id'] = test['file'].apply(lambda x: x.split('.')[0])
test['label'] = 0.5

train.head()",No,3,22.0 "sns.countplot(x='class', data=train);",No,5,33.0 "def make_model(size=(256,256)): def make_cnn(kernel_nums, x): for n in kernel_nums: x = Conv2D(n, kernel_size=3, strides=1, activation='relu', padding='same')(x) x = MaxPooling2D(pool_size=2, strides=2, padding='same')(x) x = BatchNormalization()(x) x = SpatialDropout2D(0.3)(x) return Flatten()(x) inp = Input((size[0],size[1],1)) kernel_nums = [64, 64,128,128,256,256,512,512] scaled = inp cnn_outs = [] for i in range(6): scaled = AveragePooling2D(pool_size=2**i, strides=2**i)(inp) cnn_outs.append(make_cnn(kernel_nums[:len(kernel_nums)-i], scaled)) x = concatenate(cnn_outs) x = Dense(512, activation='relu')(x) x = Dropout(0.3)(x) out = Dense(1, activation='sigmoid')(x) return Model(inp, out)",No,5,4.0 "SIZE = (256,256) model = make_model(size=SIZE) model.summary() #keras.utils.plot_model(model, to_file='model.png', show_shapes=True) #ipd.Image(filename='model.png')",No,5,4.0 "model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc']) batch_size=150 X_train, X_valid, y_train, y_valid = train_test_split(train['file'].values, train['class_id'].values, test_size=0.1) model.fit_generator(image_generator(X_train, TRAIN_DIR, labels=y_train, size=SIZE, batch_size=batch_size, random_preproc=False, rotation_range=10, shear_range=5, shift_range=(0.1,0.1), zoom_range=(0.8,1.2)), epochs=25, steps_per_epoch=int(math.ceil(len(y_train)/batch_size)), validation_data=image_generator(X_valid, TRAIN_DIR, labels=y_valid, size=SIZE, batch_size=batch_size, random_preproc=False), validation_steps=int(math.ceil(len(y_valid)/batch_size)), callbacks=[EarlyStopping(monitor='val_loss',patience=3,verbose=0)], verbose=1 )",Yes,4,7.0 "predicted_probs = model.predict_generator(image_generator(X_valid, TRAIN_DIR, size=SIZE, batch_size=batch_size, random_preproc=False), steps=int(math.ceil(len(y_valid)/batch_size)) ) predicted = np.round(predicted_probs) print(classification_report(y_valid, predicted)) print(log_loss(y_valid, predicted_probs)) sns.heatmap(confusion_matrix(y_valid, predicted), annot=True);",Yes,3,48.0 "predicted_probs = model.predict_generator(image_generator(test['file'], TEST_DIR, size=SIZE, batch_size=batch_size, random_preproc=False), steps=int(math.ceil(len(test['file'])/batch_size)) ) test['label'] = predicted_probs test[['id','label']].to_csv('submission.csv', index=False)",Yes,3,48.0 "PATH = ""../input/"" TMP_PATH = ""/tmp/tmp"" MODEL_PATH = ""/tmp/model/"" sz=224",No,5,77.0 "fnames = np.array([f'train/{f}' for f in sorted(os.listdir(f'{PATH}train'))]) labels = np.array([(0 if 'cat' in fname else 1) for fname in fnames])",No,5,77.0 "from fastai.imports import * from fastai.transforms import * from fastai.conv_learner import * from fastai.model import * from fastai.dataset import * from fastai.sgdr import * from fastai.plots import *",No,5,22.0 "arch=resnet50 ",No,5,4.0 "data = ImageClassifierData.from_names_and_array( path=PATH, fnames=fnames, y=labels, classes=['dogs', 'cats'], test_name='test', tfms= tfms_from_model(resnet34, sz, aug_tfms=transforms_side_on, max_zoom=1.1) #data augmentation ) learn = ConvLearner.pretrained(arch, data, precompute=True, tmp_name=TMP_PATH, models_name=MODEL_PATH)",No,5,30.0 "learn.fit(0.01,4)",No,5,7.0 "learn.precompute=False learn.fit(1e-2, 3, cycle_len=2)",No,5,7.0 "lr=np.array([1e-4,1e-3,1e-2])",No,5,5.0 "learn.fit(lr, 3, cycle_len=1, cycle_mult=2)",No,5,7.0 "log_predictions,y = learn.TTA(is_test=True) prob_predictions = np.mean(np.exp(log_predictions),0) probs = 
prob_predictions[:,1]",No,5,8.0 log_predictions.shape,No,5,58.0 ids= fnames = np.array([f'{f}' for f in os.listdir(f'{PATH}test')]),No,5,77.0 "ids= [i.replace("".jpg"","""") for i in ids] ids[0]",No,5,77.0 "ans= pd.DataFrame({""id"":ids,""label"":probs})
ans= ans.sort_values('id')
ans.head()",Yes,3,9.0 ans.describe(),No,5,40.0 "cm = confusion_matrix(y, valid_preds) plot_confusion_matrix(cm, data.classes)",No,5,56.0 "ans.to_csv('submission.csv', index=False)",No,5,25.0 "from sklearn import pipeline,ensemble,preprocessing, feature_extraction from sklearn.feature_extraction.text import TfidfTransformer from sklearn.model_selection import train_test_split from sklearn.metrics import confusion_matrix, accuracy_score from collections import Counter from sklearn.svm import LinearSVC from sklearn.model_selection import cross_validate #from sklearn import tree #from sklearn.naive_bayes import MultinomialNB #from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier #from sklearn.linear_model import SGDClassifier #from sklearn.linear_model import LogisticRegression",No,5,22.0 train = pd.read_json('../input/train.json'),No,5,44.0 %matplotlib inline,No,4,23.0 "import matplotlib.pyplot as plt plt.style.use('ggplot')",Yes,3,22.0 train['cuisine'].value_counts().plot(kind='bar'),No,5,33.0 "top10= pd.DataFrame([[items[0] for items in counters[cuisine].most_common(10)] for cuisine in counters], index=[cuisine for cuisine in counters], columns=['top{}'.format(i) for i in range(1,11)]) top10",No,4,12.0 "indices=train.ingredients.str.contains('garlic cloves') train[indices]['cuisine'].value_counts().plot(kind='bar', title= 'Dientes de ajo hallados por cocina')",No,5,33.0 "unique= np.unique(top10.values.ravel()) unique fig, axes= plt.subplots(8,8, figsize=(20,20)) for ingredient, ax_index in zip(unique, range(64)): indices=train.ingredients.str.contains(ingredient) relative_freq= (train[indices]['cuisine'].value_counts()/train['cuisine'].value_counts()) relative_freq.plot(kind='bar', ax=axes.ravel()[ax_index], fontsize=8, title=ingredient)",No,5,33.0 train.isnull().sum(),No,5,39.0 "fig,axes=plt.subplots(nrows=2,ncols=2) sn.boxplot(data=train,y='count',orient='v',ax=axes[0][0]) sn.boxplot(data=train,y='count',x='season',orient='v',ax=axes[0][1]) sn.boxplot(data=train,y='count',x='hour',orient='v',ax=axes[1][0]) sn.boxplot(data=train,y='count',x='workingday',orient='v',ax=axes[1][1])",No,5,33.0 trainwo=train[np.abs(train['count']-train['count'].mean())<=3*train['count'].std()],No,5,14.0 "print('Shape of the DataFrame with outliers: ', train.shape) print('Shape of the DataFrame without outliers: ', trainwo.shape)",No,5,58.0 "corr=train[[""temp"",""atemp"",""casual"",""registered"",""humidity"",""windspeed"",""count""]].corr() mask=np.array(corr) mask[np.tril_indices_from(mask)]=False fig,ax=plt.subplots() sn.heatmap(corr,mask=mask,vmax=.8,square=True,annot=True)",Yes,4,80.0 "clf=pipeline.Pipeline([ ('tfidf_vectorizer', feature_extraction.text.TfidfVectorizer(lowercase=True)), ('clf', LinearSVC(random_state=0)) ])",Yes,3,4.0 "# step 1: testing X_train,X_test,y_train,y_test=train_test_split(train.ingredients,train.cuisine, test_size=0.2)",No,5,13.0 "clf.fit(X_train, y_train)",No,5,7.0 y_pred = clf.predict(X_test),No,5,48.0 "confusion_matrix(y_test, y_pred)",No,5,49.0 "accuracy_score(y_test, y_pred)",No,5,49.0 "# step 2: real training test=pd.read_json('../input/test.json')",No,5,44.0 test.ingredients=test.ingredients.apply(' '.join),No,5,78.0 "clf.fit(train.ingredients,train.cuisine)",No,5,7.0 pred=clf.predict(test.ingredients),No,5,48.0 "df=pd.DataFrame({'id':test.id,'cuisine':pred})",No,5,12.0 "df.to_csv('LinearSVC.csv', columns=['id','cuisine'],index=False)",No,5,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here\'s several helpful packages to load in

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the ""../input/"" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("".""))

# Any results you write to the current directory are saved as output.",No,5,88.0 "from matplotlib import pyplot as plt import seaborn as sns sns.set() %matplotlib inline %config InlineBackend.figure_format = 'retina'",No,5,23.0 "df_store = pd.read_csv('../input/store.csv') df = pd.read_csv('../input/train.csv', low_memory=False) df = df.merge(df_store, on='Store')",Yes,4,45.0 "df_test = pd.read_csv('../input/test.csv', low_memory=False) df_test.head()",Yes,4,45.0 df.head(5),No,5,41.0 "df['Date'] = pd.to_datetime(df['Date']) df['Month'] = df.Date.apply(lambda dt: dt.month) df['Year'] = df.Date.apply(lambda dt: dt.year) df['WeekOfYear'] = df.Date.apply(lambda dt: dt.weekofyear) df['Day'] = df.Date.apply(lambda dt: dt.day) df['isMonthEnd'] = df.Date.apply(lambda dt: dt.is_month_end) df['isMonthStart'] = df.Date.apply(lambda dt: dt.is_month_start) df['isQuarterEnd'] = df.Date.apply(lambda dt: dt.is_quarter_end ) df['isQuarterStart'] = df.Date.apply(lambda dt: dt.is_quarter_start) df['isYearEnd'] = df.Date.apply(lambda dt: dt.is_year_end) df['isYearStart'] = df.Date.apply(lambda dt: dt.is_year_start)",No,5,8.0 "features = [] for feat in df.columns.drop('Sales'): if df[feat].dtype == np.float64 or df[feat].dtype == np.int64: features.append(feat)",No,5,77.0 "fig, axes = plt.subplots(nrows=4, ncols=4, figsize=(20, 20)); df_sample = df.sample(frac=0.05) for idx, feature in enumerate(features): df_sample.plot(feature, ""Sales"", subplots=True, kind=""scatter"", ax=axes[idx // 4, idx % 4]);",No,5,33.0 # Customers Sales ( Open/Promo). -
# Promo2

df[df.columns.drop('Sales')].corrwith(df.Sales),No,5,40.0 "# ""b""

df.groupby(\'StoreType\')[\'Sales\'].mean()",No,5,60.0 sns.distplot(df.Sales[df.Sales > 0]),No,5,33.0 df.info(),No,5,40.0 "# Promo2, ~SinceWeek ~SinceYear
df[(pd.isnull(df.Promo2SinceWeek) | pd.isnull(df.Promo2SinceYear)) & df.Promo2 != 0]",No,5,14.0 "df['CompetitionOpenSinceMonth'].fillna(0, inplace=True) df['CompetitionOpenSinceYear'].fillna(0, inplace=True)",No,5,17.0 "df['Promo2SinceWeek'].fillna(0, inplace=True) df['Promo2SinceYear'].fillna(0, inplace=True)",No,5,17.0 "df['CompetitionDistance'].fillna(df['CompetitionDistance'].median(), inplace=True) df['CompetitionDistance'] = np.log(df.CompetitionDistance) + 1",Yes,4,17.0 "df.sample(frac=.001).plot(\'CompetitionDistance\', ""Sales"", subplots=True, kind=""scatter"")",No,5,33.0 "# ,
df.groupby('Store')['CompetitionDistance'].unique().apply(lambda l: 1 if len(l) > 1 else 0).sum()",No,5,60.0 "# ,
df['StateHoliday'] = df['StateHoliday'].replace(0, '0')
df['Holiday'] = df.StateHoliday.apply(lambda x: 0 if x == '0' else 1)

df.drop('StateHoliday', axis=1, inplace=True)",Yes,4,8.0 "df = df.sort_values(by='Date') df.drop('Date', axis=1, inplace=True)",Yes,3,9.0 "df = df[(df['Open'] != 0) & (df['Sales'] != 0)] df.drop('Open', axis=1, inplace=True)",Yes,3,14.0 #

df.PromoInterval.value_counts(),No,5,72.0 "df['isMonthEnd'] = df['isMonthEnd'].astype(int) df['isMonthStart'] = df['isMonthStart'].astype(int) df['isQuarterEnd'] = df['isQuarterEnd'].astype(int) df['isQuarterStart'] = df['isQuarterStart'].astype(int) df['isYearEnd'] = df['isYearEnd'].astype(int) df['isYearStart'] = df['isYearStart'].astype(int)",No,5,16.0 "# competition open time (in months)
df['CompetitionOpen'] = 12 * (df.Year - df.CompetitionOpenSinceYear) + \
(df.Month - df.CompetitionOpenSinceMonth)

# Promo open time
df['PromoOpen'] = 12 * (df.Year - df.Promo2SinceYear) + \
(df.WeekOfYear - df.Promo2SinceWeek) / 4.0
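# pd.get_dummies below one-hot encodes the listed categorical columns; dummy_na=True also
# adds an explicit indicator column for missing values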

df = pd.get_dummies(df, columns=['DayOfWeek', 'StoreType', 'Assortment','PromoInterval'], dummy_na=True)",Yes,4,8.0 "import xgboost as xgb
from hyperopt import hp
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
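# train() below fits one XGBoost regressor on the rows selected by `index` (presumably a
# per-store mask built outside this cell): Sales is modelled as log(Sales) + 1, an optional
# hyperopt search tunes the parameters, and RMSPE on a small hold-out tail is reported
# (rmspe / rmspe_xg are assumed to be defined in an earlier cell)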

def train(index, train, hp_selection=False):
    train_store = train[index]
    X = train_store[train_store.columns.drop(['Sales', 'Store', 'Customers'])]
    y = train_store['Sales']

    train_size = int(X.shape[0]*.99)
    print(f'Regressor for {index} store\nTraining on {X.shape[0]} samples')
    X_train, y_train = X.iloc[:train_size], y.iloc[:train_size]
    X_test, y_test = X.iloc[train_size:], y.iloc[train_size:]

    xtrain = xgb.DMatrix(X_train, np.log(y_train.values) + 1)
    xtest = xgb.DMatrix(X_test, np.log(y_test.values) + 1)

    if hp_selection:
        def score(params):
            num_round = 200
            model = xgb.train(params, xtrain, num_round, feval=rmspe_xg)
            predictions = model.predict(xtest)
            score = rmspe(y=y_test, yhat=predictions)
            return {'loss': score, 'status': STATUS_OK}

        def optimize(trials):
            space = {
                'n_estimators' : hp.quniform('n_estimators', 1, 1000, 1),
                'eta' : hp.quniform('eta', 0.2, 0.825, 0.025),
                'max_depth' : hp.choice('max_depth', np.arange(1, 14, dtype=int)),
                'min_child_weight' : hp.quniform('min_child_weight', 1, 6, 1),
                'subsample' : hp.quniform('subsample', 0.7, 1, 0.05),
                'gamma' : hp.quniform('gamma', 0.5, 1, 0.05),
                'colsample_bytree' : hp.quniform('colsample_bytree', 0.5, 1, 0.05),
                'eval_metric': 'rmse',
                'objective': 'reg:linear',
                'nthread': 4,
                'silent' : 1
            }

            best = fmin(score, space, algo=tpe.suggest, trials=trials, max_evals=250)
            return best

        trials = Trials()
        best_opts = optimize(trials)
        best_opts['silent'] = 1
    else:
        best_opts = {'colsample_bytree': 0.7,
                     'eta': 0.625,
                     'gamma': 0.8,
                     'max_depth': 6,
                     'eval_metric': 'rmse',
                     'min_child_weight': 6.0,
                     'n_estimators': 8.0, # 585
                     'silent': 1,
                     'nthread': 4,
                     'subsample': 0.95}

    watchlist = [(xtrain, 'train'), (xtest, 'eval')]
    num_round = 10000
    regressor = xgb.train(best_opts, xtrain, num_round, watchlist, feval=rmspe_xg,
                          verbose_eval=10, early_stopping_rounds=50)
    print(""Validating"")
    train_probs = regressor.predict(xtest)
    indices = train_probs < 0
    train_probs[indices] = 0
    error = rmspe(np.exp(train_probs) - 1, y_test.values)
    print('error', error)
    regressor = xgb.train(best_opts, xtest, 10, feval=rmspe_xg, xgb_model=regressor)
    return regressor",Yes,2,2.0 "df_test = pd.read_csv('../input/test.csv', low_memory=False)
closed_store_ids = df_test[""Id""][df_test[""Open""] == 0].values

df_test = df_test.merge(df_store, on='Store')
df_test['Date'] = pd.to_datetime(df_test['Date'])
df_test['Month'] = df_test.Date.apply(lambda dt: dt.month)
df_test['Year'] = df_test.Date.apply(lambda dt: dt.year)
df_test['WeekOfYear'] = df_test.Date.apply(lambda dt: dt.weekofyear)
df_test['Day'] = df_test.Date.apply(lambda dt: dt.day)

df_test['isMonthEnd'] = df_test.Date.apply(lambda dt: dt.is_month_end).astype(int)
df_test['isMonthStart'] = df_test.Date.apply(lambda dt: dt.is_month_start).astype(int)
df_test['isQuarterEnd'] = df_test.Date.apply(lambda dt: dt.is_quarter_end).astype(int)
df_test['isQuarterStart'] = df_test.Date.apply(lambda dt: dt.is_quarter_start).astype(int)
df_test['isYearEnd'] = df_test.Date.apply(lambda dt: dt.is_year_end).astype(int)
df_test['isYearStart'] = df_test.Date.apply(lambda dt: dt.is_year_start).astype(int)

df_test['CompetitionOpenSinceMonth'].fillna(0, inplace=True)
df_test['CompetitionOpenSinceYear'].fillna(0, inplace=True)

df_test['Promo2SinceWeek'].fillna(0, inplace=True)
df_test['Promo2SinceYear'].fillna(0, inplace=True)

df_test['CompetitionDistance'].fillna(df_test['CompetitionDistance'].median(), inplace=True)

df_test['StateHoliday'] = df_test['StateHoliday'].replace(0, '0')
df_test['Holiday'] = df_test.StateHoliday.apply(lambda x: 0 if x == '0' else 1)

df_test.drop('StateHoliday', axis=1, inplace=True)
df_test.drop('Date', axis=1, inplace=True)

# competition open time (in months)
df_test['CompetitionOpen'] = 12 * (df_test.Year - df_test.CompetitionOpenSinceYear) + \
(df_test.Month - df_test.CompetitionOpenSinceMonth)

# Promo open time
df_test['PromoOpen'] = 12 * (df_test.Year - df_test.Promo2SinceYear) + \
(df_test.WeekOfYear - df_test.Promo2SinceWeek) / 4.0

df_test.drop(['Open'], axis=1, inplace=True)

df_test = pd.get_dummies(df_test, columns=['DayOfWeek', 'StoreType', 'Assortment','PromoInterval'], dummy_na=True)
",Yes,4,8.0 "fig,(ax1,ax2,ax3)=plt.subplots(ncols=3) sn.regplot(x='temp',y='count',data=train,ax=ax1) sn.regplot(x='windspeed',y='count',data=train,ax=ax2) sn.regplot(x='humidity',y='count',data=train,ax=ax3)",No,5,33.0 "train=pd.read_csv(\'../input/train.csv\')
test=pd.read_csv(""../input/test.csv"")",No,5,45.0 "data=train.append(test) data.reset_index(inplace=True) data.drop('index',inplace=True,axis=1) ",Yes,4,10.0 "data[\'date\']=data[\'datetime\'].apply(lambda x:x.split()[0])
data['hour']=data['datetime'].apply(lambda x:x.split()[1].split(':')[0])
data['year']=data['date'].apply(lambda x:x.split('-')[0])
data['month']=data['date'].apply(lambda x:datetime.strptime(x,""%Y-%m-%d"").month)
data[\'weekday\']=data[\'date\'].apply(lambda x:datetime.strptime(x,""%Y-%m-%d"").weekday())",No,5,8.0 "from sklearn.ensemble import RandomForestRegressor wind0=data[data['windspeed']==0] windNot0=data[data['windspeed']!=0] rf_wind=RandomForestRegressor() wind_cols=['season','weather','year','month','temp','atemp','humidity'] rf_wind.fit(windNot0[wind_cols],windNot0['windspeed']) pred=rf_wind.predict(X=wind0[wind_cols]) wind0['windspeed']=pred data=windNot0.append(wind0) data.reset_index(inplace=True) data.drop('index',axis=1,inplace=True) ",Yes,2,7.0 "categorical_features=['season','month','year','workingday','holiday','weather','hour'] numerical_features=['humidity','windspeed','temp','atemp'] drop_features=['casual','registered','datetime','date','count'] for var in categorical_features: data[var]=data[var].astype('category')",No,5,16.0 "train=data[pd.notnull(data['count'])].sort_values(by=['datetime']) test=data[~pd.notnull(data['count'])].sort_values(by='datetime') datetimecol=test['datetime'] y_train=train['count'] y_train_registered=train['registered'] y_train_casual=train['casual'] train=train.drop(drop_features,axis=1) test=test.drop(drop_features,axis=1)",No,5,21.0 "from sklearn.linear_model import LinearRegression, Ridge, Lasso from sklearn.model_selection import GridSearchCV from sklearn import metrics import warnings pd.options.mode.chained_assignment = None warnings.filterwarnings(""ignore"", category=DeprecationWarning) lr=LinearRegression() y_train_log=np.log1p(y_train) lr.fit(train,y_train_log) pred=lr.predict(train) print(""RMSLE Value For Linear Regression: "",rmsle(y_train_log,pred,True)) ",Yes,2,22.0 df_submission,No,5,41.0 "df_submission.to_csv('submission.csv', index=False)",No,5,25.0 "from sklearn.ensemble import RandomForestRegressor rf=RandomForestRegressor(n_estimators=100) y_train_log=np.log1p(y_train) rf.fit(train,y_train_log) pred=rf.predict(train) print(""RMSLE Value For Random Forest: "", rmsle(y_train_log,pred,True))",Yes,3,7.0 "# def score(params):
# print(""Training with params : "")
# print(params)
# num_round = int(params[\'n_estimators\'])
# model = xgb.train(params, xtrain, num_round, feval=rmspe_xg)
# predictions = model.predict(xtest)
# score = rmspe(y=y_test, yhat=predictions)
# br = \'-\'*124
# print(f'{br}\n\tScore of RMSPE: {score}\n{br}')
# return {\'loss\': score, \'status\': STATUS_OK}

# def optimize(trials):
# space = {
# \'n_estimators\' : hp.quniform(\'n_estimators\', 1, 1000, 1),
# \'eta\' : hp.quniform(\'eta\', 0.3, 0.825, 0.025),
# \'max_depth\' : hp.choice(\'max_depth\', np.arange(1, 14, dtype=int)),
# \'min_child_weight\' : hp.quniform(\'min_child_weight\', 1, 6, 1),
# \'subsample\' : hp.quniform(\'subsample\', 0.7, 1, 0.05),
# \'gamma\' : hp.quniform(\'gamma\', 0.5, 1, 0.05),
# \'colsample_bytree\' : hp.quniform(\'colsample_bytree\', 0.5, 1, 0.05),
# \'eval_metric\': \'rmse\',
# \'objective\': \'reg:linear\',
# \'nthread\': 4,
# \'silent\' : 1
# }

# best = fmin(score, space, algo=tpe.suggest, trials=trials, max_evals=250)

# print(best)
# return best


# trials = Trials()
# best_opts = optimize(trials)
",No,5,53.0 "# def score(params):
# print(""Training with params : "")
# print(params)
# num_round = 25 # int(params[\'n_estimators\'])
# # del params[\'n_estimators\']
# dtrain = xgb.DMatrix(X_train, label=y_train)
# dvalid = xgb.DMatrix(X_test, label=y_test)
# model = xgb.train(params, dtrain, num_round)
# predictions = model.predict(dvalid)
# score = mae(y_test, predictions)
# br = \'-\'*130
# print(f'{br}\n\tScore of MAE: {score}\n{br}')
# return {\'loss\': score, \'status\': STATUS_OK}

# def optimize(trials):
# space = {
# \'n_estimators\' : hp.quniform(\'n_estimators\', 100, 1000, 1),
# \'eta\' : hp.quniform(\'eta\', 0.4, 0.825, 0.025),
# \'max_depth\' : hp.choice(\'max_depth\', np.arange(1, 14, dtype=int)),
# \'min_child_weight\' : hp.quniform(\'min_child_weight\', 1, 6, 1),
# \'subsample\' : hp.quniform(\'subsample\', 0.5, 1, 0.05),
# \'gamma\' : hp.quniform(\'gamma\', 0.5, 1, 0.05),
# \'colsample_bytree\' : hp.quniform(\'colsample_bytree\', 0.5, 1, 0.05),
# \'eval_metric\': \'mae\',
# \'objective\': \'reg:linear\',
# \'nthread\': 4,
# \'silent\' : 1
# }

# best = fmin(score, space, algo=tpe.suggest, trials=trials, max_evals=50)

# print(best)

# trials = Trials()
# optimize(trials)",No,5,53.0 "# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here\'s several helpful packages to load in

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the ""../input/"" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import lightgbm as lgbm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import log_loss, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, OneHotEncoder

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dense, Dropout, BatchNormalization, Activation
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import optimizers

import warnings
warnings.filterwarnings(\'ignore\')

import os
print(os.listdir(""../input""))

import regex as re
import gc
# Any results you write to the current directory are saved as output.",No,4,88.0 "baseline_tree_score = 0.23092278864723115 baseline_neuralnetwork_score = 0.5480561937041435",No,5,77.0 "train = pd.read_csv('../input/kaggletutorial/covertype_train.csv') test = pd.read_csv('../input/kaggletutorial/covertype_test.csv')",No,5,45.0 train_index = train.shape[0],No,5,77.0 "lgbm_param = {
\'boosting_type\': \'gbdt\',
\'objective\': \'binary\',
\'metric\': \'binary_logloss\',
""learning_rate"": 0.06,
""num_leaves"": 16,
""max_depth"": 6,
""colsample_bytree"": 0.7,
""subsample"": 0.8,
""reg_alpha"": 0.1,
""reg_lambda"": 0.1,
""nthread"":8
}",No,5,59.0 "def keras_model(input_dims): model = Sequential() model.add(Dense(input_dims, input_dim=input_dims)) model.add(BatchNormalization()) model.add(Activation('relu')) model.add(Dropout(0.3)) model.add(Dense(input_dims//2)) model.add(BatchNormalization()) model.add(Activation('relu')) model.add(Dropout(0.2)) # output layer (y_pred) model.add(Dense(1)) model.add(Activation('sigmoid')) # compile this model model.compile(loss='binary_crossentropy', # one may use 'mean_absolute_error' as alternative optimizer='adam', metrics=['accuracy']) return model def keras_history_plot(history): plt.plot(history.history['loss'], 'y', label='train loss') plt.plot(history.history['val_loss'], 'r', label='val loss') plt.xlabel('epoch') plt.ylabel('loss') plt.legend(loc='upper right') plt.show()",Yes,4,4.0 "from sklearn.ensemble import GradientBoostingRegressor gbr=GradientBoostingRegressor(n_estimators=4000,alpha=0.01) y_train_log=np.log1p(y_train) gbr.fit(train,y_train_log) pred=gbr.predict(train) print(""RMSLE Value For Gradient Boost: "", rmsle(y_train_log,pred,True))",Yes,3,7.0 "lgbm_param = {
\'boosting_type\': \'gbdt\',
\'objective\': \'binary\',
\'metric\': \'binary_logloss\',
""learning_rate"": 0.03,
""num_leaves"": 24,
""max_depth"": 6,
""colsample_bytree"": 0.65,
""subsample"": 0.7,
""reg_alpha"": 0.1,
""reg_lambda"": 0.2,
""nthread"":8
}",No,5,59.0 "pred_test=gbr.predict(test) fig,(ax1,ax2)=plt.subplots(ncols=2) sn.distplot(y_train,ax=ax1,bins=50) sn.distplot(np.exp(pred_test),ax=ax2,bins=50)",Yes,3,48.0 "submission=pd.DataFrame({ 'datetime':datetimecol, 'count':[max(0,x) for x in np.exp(pred_test)] }) submission.to_csv('bike_predictions_gbm.csv',index=False)",Yes,4,25.0 "import os print((os.listdir('../input/')))",No,5,88.0 "import pandas as pd from sklearn.ensemble import RandomForestClassifier from sklearn.metrics import roc_auc_score",No,5,22.0 "df_train = pd.read_csv('../input/web-club-recruitment-2018/train.csv') df_test = pd.read_csv('../input/web-club-recruitment-2018/test.csv') feature_cols=['X1','X2','X3','X5','X6','X7','X8','X9','X10','X12','X13','X14','X15','X16','X17','X18','X19','X20','X21','X22','X23']",No,5,45.0 "dtrain = lgbm.Dataset(train_df, label=y_value) clf = lgbm.train(lgbm_param, train_set=dtrain, num_boost_round=5000) predict = clf.predict(test_df)",Yes,3,7.0 "submission = pd.read_csv('../input/kaggletutorial/sample_submission.csv') submission['Cover_Type'] = predict submission.to_csv('lgbm_last.csv', index=False)",No,3,25.0 "def keras_model(input_dims): model = Sequential() model.add(Dense(input_dims, input_dim=input_dims)) model.add(BatchNormalization()) model.add(Activation('relu')) model.add(Dropout(0.3)) model.add(Dense(input_dims)) model.add(BatchNormalization()) model.add(Activation('relu')) model.add(Dropout(0.3)) model.add(Dense(input_dims//2)) model.add(BatchNormalization()) model.add(Activation('relu')) model.add(Dropout(0.3)) model.add(Dense(input_dims//5)) model.add(BatchNormalization()) model.add(Activation('relu')) model.add(Dropout(0.3)) # output layer (y_pred) model.add(Dense(1)) model.add(Activation('sigmoid')) # compile this model model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) return model ",No,5,84.0 "y_value = train_df['Cover_Type'] del train_df['Cover_Type'], train_df['ID'] del test_df['Cover_Type'], test_df['ID'] model = keras_model(train_df.shape[1]) callbacks = [ EarlyStopping( patience=10, verbose=10) ]",No,4,4.0 """"""" CV .
NFOLD = 5
folds = StratifiedKFold(n_splits= NFOLD, shuffle=True, random_state=2018)

total_score = 0
best_epoch = 0
for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df, y_value)):
train_x, train_y = train_df.iloc[train_idx], y_value.iloc[train_idx]
valid_x, valid_y = train_df.iloc[valid_idx], y_value.iloc[valid_idx]

history = model.fit(train_x.values, train_y.values, nb_epoch=30, batch_size = 64, validation_data=(valid_x.values, valid_y.values),
verbose=1, callbacks=callbacks)

keras_history_plot(history)
predict = model.predict(valid_x.values)
null_count = np.sum(pd.isnull(predict) )
if null_count > 0:
print(""Null Prediction Error: "", null_count)
predict[pd.isnull(predict)] = predict[~pd.isnull(predict)].mean()

cv_score = log_loss(valid_y, predict )
total_score += cv_score
best_epoch = max(best_epoch, np.max(history.epoch))
print(\'Fold {} LogLoss : {}\'.format(n_fold + 1, cv_score ))

print(""Best Epoch: "", best_epoch)
print(""Total LogLoss"", total_score/NFOLD)
print(""Baseline model Score Diff"", total_score/NFOLD - baseline_neuralnetwork_score)
""""""",Yes,3,7.0 "history = model.fit(train_df.values, y_value.values, nb_epoch=30, batch_size = 64, verbose=1) predict = model.predict(test_df.values)",Yes,3,7.0 "submission_nn = pd.read_csv('../input/kaggletutorial/sample_submission.csv') submission_nn['Cover_Type'] = predict submission_nn.to_csv('nn_last.csv', index=False)",No,4,25.0 "source = submission.copy() source = source.merge(submission_nn,on='ID') source",No,5,32.0 "lgbm_param1 = {
\'boosting_type\': \'dart\',
\'objective\': \'binary\',
\'metric\': \'binary_logloss\',
""learning_rate"": 0.03,
""num_leaves"": 31,
""max_depth"": 7,
""colsample_bytree"": 0.8,
""subsample"": 0.8,
""reg_alpha"": 0.1,
""reg_lambda"": 0.1,
""nthread"":8,
\'drop_rate\':0.1,
\'skip_drop\':0.5,
\'max_drop\':50,
\'top_rate\':0.1,
\'other_rate\':0.1
}

lgbm_param2 = {
\'boosting_type\': \'gbdt\',
\'objective\': \'binary\',
\'metric\': \'binary_logloss\',
""learning_rate"": 0.03,
""num_leaves"": 10,
""max_depth"": 4,
""colsample_bytree"": 0.5,
""subsample"": 0.8,
""reg_alpha"": 0.1,
""reg_lambda"": 0.1,
""nthread"":8
}

lgbm_param3 = {
\'boosting_type\': \'gbdt\',
\'objective\': \'binary\',
\'metric\': \'binary_logloss\',
""learning_rate"": 0.03,
""num_leaves"": 24,
""max_depth"": 6,
""colsample_bytree"": 0.5,
""subsample"": 0.8,
""reg_alpha"": 0.1,
""reg_lambda"": 0.1,
""nthread"":8
}

rf_params = {
\'criterion\':\'gini\', \'max_leaf_nodes\':24, \'n_estimators\':200, \'min_impurity_split\':0.0000001,
\'max_features\':0.4, \'max_depth\':6, \'min_samples_leaf\':20, \'min_samples_split\':2,
\'min_weight_fraction_leaf\':0.0, \'bootstrap\':True,
\'random_state\':1, \'verbose\':False

}

et_parmas = {
\'criterion\':\'gini\', \'max_leaf_nodes\':31, \'n_estimators\':200, \'min_impurity_split\':0.0000001,
\'max_features\':0.6, \'max_depth\':10, \'min_samples_leaf\':20, \'min_samples_split\':2,
\'min_weight_fraction_leaf\':0.0, \'bootstrap\':True,
\'random_state\':1, \'verbose\':False
}",No,5,59.0 "et_model = SklearnWrapper(clf = ExtraTreesClassifier, params=et_parmas) rf_model = SklearnWrapper(clf = RandomForestClassifier, params=rf_params)",No,5,4.0 "x_train = pd.DataFrame(x_train_second_layer) x_test = pd.DataFrame(x_test_second_layer)",No,5,12.0 "submission_stacking = pd.read_csv('../input/kaggletutorial/sample_submission.csv') submission_stacking['Cover_Type'] = predict_stacking submission_stacking.to_csv('submission_stacking.csv', index=False)",No,4,25.0 "submission_et = pd.read_csv('../input/kaggletutorial/sample_submission.csv') submission_et['Cover_Type'] = et_test submission_et.to_csv('submission_et.csv', index=False)",No,4,25.0 "import cv2 # working with, mainly resizing, images
import numpy as np # dealing with arrays
import os # dealing with directories
from random import shuffle # mixing up or currently ordered data that might lead our network astray in training.
from tqdm import tqdm # a nice pretty percentage bar for tasks. Thanks to viewer Daniel Bhler for this suggestion
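# configuration below: 50x50 single-channel (grayscale) inputs and a 1e-3 learning rate
# for the tflearn convnet defined later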

TRAIN_DIR = '../input/train'
TEST_DIR = '../input/test'
IMG_SIZE = 50
LR = 1e-3

MODEL_NAME = 'dogsvscats-{}-{}.model'.format(LR, '2conv-basic') # just so we remember which saved model is which, sizes must match",Yes,4,22.0 "train_X = df_train[feature_cols] train_y = df_train.loc[:, 'Y'] df_test = df_test[feature_cols]",No,5,21.0 "rf = RandomForestClassifier(n_estimators=200,max_features='auto',max_depth=23)",No,5,4.0 "train_data = create_train_data() # If you have already created the dataset: #train_data = np.load('train_data.npy')",No,5,53.0 "rf.fit(train_X, train_y)",No,5,7.0 "import tflearn from tflearn.layers.conv import conv_2d, max_pool_2d from tflearn.layers.core import input_data, dropout, fully_connected from tflearn.layers.estimator import regression convnet = input_data(shape=[None, IMG_SIZE, IMG_SIZE, 1], name='input') convnet = conv_2d(convnet, 32, 5, activation='relu') convnet = max_pool_2d(convnet, 5) convnet = conv_2d(convnet, 64, 5, activation='relu') convnet = max_pool_2d(convnet, 5) convnet = fully_connected(convnet, 1024, activation='relu') convnet = dropout(convnet, 0.8) convnet = fully_connected(convnet, 2, activation='softmax') convnet = regression(convnet, optimizer='adam', learning_rate=LR, loss='categorical_crossentropy', name='targets') model = tflearn.DNN(convnet, tensorboard_dir='log')",Yes,4,4.0 " pred = rf.predict_proba(df_test)",No,5,48.0 "if os.path.exists('{}.meta'.format(MODEL_NAME)): model.load(MODEL_NAME) print('model loaded!')",No,5,30.0 "result = pd.DataFrame(pred[:,1]) result.index.name = 'id' result.columns = ['predicted_val'] result.to_csv('output.csv', index=True)",Yes,4,25.0 "train = train_data[:-500] test = train_data[-500:]",No,4,77.0 "import pandas as pd import numpy as np from sklearn.ensemble import RandomForestClassifier from sklearn.metrics import roc_auc_score from sklearn.ensemble import ExtraTreesClassifier from sklearn.tree import DecisionTreeClassifier import matplotlib.pyplot as plt from sklearn.model_selection import train_test_split ",No,5,22.0 "df_train = pd.read_csv('../input/web-club-recruitment-2018/train.csv') df_test = pd.read_csv('../input/web-club-recruitment-2018/test.csv') ",No,5,45.0 "X = np.array([i[0] for i in train]).reshape(-1,IMG_SIZE,IMG_SIZE,1) Y = [i[1] for i in train] test_x = np.array([i[0] for i in test]).reshape(-1,IMG_SIZE,IMG_SIZE,1) test_y = [i[1] for i in test]",No,5,21.0 "model.fit({'input': X}, {'targets': Y}, n_epoch=2, validation_set=({'input': test_x}, {'targets': test_y}), snapshot_step=50000, show_metric=True, run_id=MODEL_NAME)",No,5,7.0 "X = df_train.loc[:, 'X1':'X23'] y = df_train.loc[:, 'Y'] ",No,5,14.0 "import tensorflow as tf tf.reset_default_graph()",No,5,23.0 "rf.fit(X, y) ",No,5,7.0 "convnet = input_data(shape=[None, IMG_SIZE, IMG_SIZE, 1], name='input') convnet = conv_2d(convnet, 32, 5, activation='relu') convnet = max_pool_2d(convnet, 5) convnet = conv_2d(convnet, 64, 5, activation='relu') convnet = max_pool_2d(convnet, 5) convnet = conv_2d(convnet, 32, 5, activation='relu') convnet = max_pool_2d(convnet, 5) convnet = conv_2d(convnet, 64, 5, activation='relu') convnet = max_pool_2d(convnet, 5) convnet = conv_2d(convnet, 32, 5, activation='relu') convnet = max_pool_2d(convnet, 5) convnet = conv_2d(convnet, 64, 5, activation='relu') convnet = max_pool_2d(convnet, 5) convnet = fully_connected(convnet, 1024, activation='relu') convnet = dropout(convnet, 0.8) convnet = fully_connected(convnet, 2, activation='softmax') convnet = regression(convnet, optimizer='adam', learning_rate=LR, loss='categorical_crossentropy', name='targets') model = tflearn.DNN(convnet, 
tensorboard_dir='log') if os.path.exists('{}.meta'.format(MODEL_NAME)): model.load(MODEL_NAME) print('model loaded!') train = train_data[:-500] test = train_data[-500:] X = np.array([i[0] for i in train]).reshape(-1,IMG_SIZE,IMG_SIZE,1) Y = [i[1] for i in train] test_x = np.array([i[0] for i in test]).reshape(-1,IMG_SIZE,IMG_SIZE,1) test_y = [i[1] for i in test] model.fit({'input': X}, {'targets': Y}, n_epoch=4, validation_set=({'input': test_x}, {'targets': test_y}), snapshot_step=500, show_metric=True, run_id=MODEL_NAME)",No,2,4.0 model.save(MODEL_NAME),No,5,50.0 "test = df_test.loc[:, 'X1':'X23'] pred = rf.predict_proba(test)",Yes,4,48.0 "with open('submission_file.csv','w') as f:
    f.write('id,label\n')

with open('submission_file.csv','a') as f:
    for data in tqdm(test_data):
        img_num = data[1]
        img_data = data[0]
        orig = img_data
        data = img_data.reshape(IMG_SIZE,IMG_SIZE,1)
        model_out = model.predict([data])[0]
        f.write('{},{}\n\
'.format(img_num,model_out[1]))",No,5,25.0 "import pandas as pd import matplotlib.pyplot as plt import sklearn import os %matplotlib inline print(os.listdir(""../input/dataset-adult/""))",No,5,88.0 "#Adult Data adult = pd.read_csv(""../input/dataset-adult/train_data.csv"",sep="","", na_values=""?"")",No,5,45.0 adult.shape,No,5,58.0 "import pandas as pd import sklearn",No,5,22.0 adult.head(3),No,5,41.0 "import os print(os.listdir('../input'))",No,5,88.0 adult.info(),No,5,40.0 "adult = pd.read_csv(""../input/dataadult/train_data.csv"",
sep=r'\s*,\s*',
engine='python',
na_values=""?"")",No,5,45.0 adult.describe(),No,5,40.0 adult.head(),No,5,41.0 adult['native.country'].value_counts(),No,5,72.0 "adult['race'].value_counts().plot(kind=""pie"")",No,5,33.0 "#distribuio de idade
adult[""age""].plot(kind=\'hist\',bins=15);",No,5,33.0 "#agrupando atributo \'income\' e \'sex\' para cada idade
df=adult.groupby([""income"",""sex""]).mean()
df[\'age\'].plot(kind=""bar"")",No,5,33.0 nadult = adult.dropna(),No,5,17.0 "#Proporo de sexo por \'income\'
df2=adult.groupby([""income"",""sex""]).size().unstack().plot(kind=\'bar\',stacked=False)",No,5,33.0 nadult,No,5,41.0 "#proporo de sexo por ocupao!
df2=adult.groupby([""occupation"",""sex""])[\'race\'].size().unstack().plot(kind=\'barh\',stacked=True)",No,5,33.0 "#drop colunas empty e index ""Id"" na_adult=adult.set_index(""Id"").dropna()",No,5,17.0 "test_adult= pd.read_csv(""../input/dataset-adult/test_data.csv"",sep="","",na_values=""?"")",No,5,45.0 Yadult = nadult.income,No,4,77.0 from sklearn.neighbors import KNeighborsClassifier,No,5,22.0 knn = KNeighborsClassifier(n_neighbors=5),No,5,4.0 from sklearn.model_selection import cross_val_score,No,5,22.0 "#armazena todos os dados de treino e de teste (numericos e categoricos) X_adult = na_adult.iloc[:,:-1] Y_adult = na_adult.income",No,5,21.0 "X_test = test_adult.iloc[:,:]",No,5,21.0 "#treino e teste apenas de dados numericos adult num_cols=[""age"",""education.num"",""capital.gain"",""capital.loss"",""hours.per.week""] X_num=X_adult[num_cols] Y_num=Y_adult X_test= X_test[num_cols]",No,5,21.0 "knn.fit(Xadult, Yadult)",No,5,7.0 "testadult = pd.read_csv(""../input/dataadult/test_data.csv"",
sep=r'\s*,\s*',
engine='python',
na_values=""?"")",No,5,45.0 "#importacao de bibliotecas de ML from sklearn.neighbors import KNeighborsClassifier from sklearn.model_selection import cross_val_score from sklearn.metrics import accuracy_score from sklearn.preprocessing import LabelEncoder",No,5,22.0 YtestPred = knn.predict(Xtestadult),No,5,48.0 "arr1= testadult.iloc[:,0].values
arr1 = arr1.ravel()
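# build the submission frame: the first column of the test file (the Id) paired with the
# kNN predictions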
dataset = pd.DataFrame({'Id':arr1[:],'income':YtestPred[:]})
dataset.to_csv(""Adultscompetition.csv"", index = False)",Yes,3,25.0 "import pandas as pd import sklearn",No,5,22.0 "adult = pd.read_csv(""../input/adult-db/train_data.csv"",header=0, index_col=0, na_values=""?"")",No,5,45.0 print(adult.shape),No,5,58.0 "#learning e predict knn=KNeighborsClassifier(n_neighbors=9) #instacia model scores = cross_val_score(knn,X_num,Y_num,cv=10) #validacao cruzada knn.fit(X_num,Y_num) Y_testpredict=knn.predict(X_test) scores",Yes,3,4.0 "#observando tipo de dados.. test_adult.dtypes",No,5,70.0 "#Converte 'object columns para 'str', pois object pode conter dados em outro formato: convert_cols=['workclass','education','marital.status','occupation','race','relationship', 'sex','native.country'] test_adult[convert_cols] = test_adult[convert_cols].astype(str)",No,5,16.0 "#testes.. test_adult.columns",No,5,71.0 "#treino e teste de dados numericos e categoricos: Xencode_adult= na_adult.iloc[:,:-1].apply(LabelEncoder().fit_transform) Xencode_test_adult = test_adult.apply(LabelEncoder().fit_transform) X_adult = Xencode_adult X_test = Xencode_test_adult",No,5,20.0 "adult[""native.country""].value_counts()",No,5,72.0 "import matplotlib.pyplot as plt %matplotlib inline",No,5,23.0 "Yfit_adult= LabelEncoder().fit(na_adult[""income""]) Y_adult = Yfit_adult.transform(na_adult[""income""])",No,5,20.0 "adult[""age""].value_counts().plot(kind=""bar"")",No,5,33.0 "#learning e predict knn =KNeighborsClassifier(n_neighbors=10) scores = cross_val_score(knn,X_adult,Y_adult,cv=10) knn.fit(X_adult,Y_adult) scores",Yes,3,4.0 "adult[""sex""].value_counts()",No,5,72.0 "adult[""education.num""].value_counts().plot(kind=""bar"")",No,5,33.0 "Ytest_predict= knn.predict(X_test) print(Ytest_predict)",No,5,48.0 "adult[""occupation""].value_counts().plot(kind=""bar"")",No,5,33.0 "pd.unique(adult[""relationship""])",No,5,57.0 X_adult.columns,No,5,71.0 "#escolha de atributos para melhor predict atributos=atributos=[""age"",""workclass"",""education.num"",""occupation"",""sex"",""marital.status"",""capital.gain"",""capital.loss""] X_adult = Xencode_adult[atributos] X_test = Xencode_test_adult[atributos]",No,4,21.0 "#Escolhendo k=27 p kNN knn =KNeighborsClassifier(n_neighbors=27) knn.fit(X_adult,Y_adult) scores",Yes,3,4.0 "from sklearn.preprocessing import MinMaxScaler minmaxscaler = MinMaxScaler() col_inds = [0,1,4,5,6,7,8,10,11,12] # 0,1 [0,1,3,4,5,6,7,8,9,10,13] 2 [0,1,3,4,5,6,7,8,9,10,11,12,13] 3 [0,1,4,5,6,7,8,10,11,12] Xadult_unscaled = adult_fill.iloc[:,col_inds].apply(LabelEncoder().fit_transform) Xadult = minmaxscaler.fit_transform(Xadult_unscaled) Yadult = adult_fill.income print(Xadult_unscaled.columns.values)",Yes,3,18.0 Ytest_predict= knn.predict(X_test),No,5,48.0 "#dados de submissao label_out = Yfit_adult.inverse_transform(Ytest_predict) df_out = pd.DataFrame({'Id': X_test.index,'income':label_out}) df_out.to_csv('submission_adult.csv',index=False)",No,5,25.0 "pd.read_csv(""submission_adult.csv"")",No,5,45.0 "%matplotlib inline import pandas as pd import sklearn import matplotlib.pyplot as plt import numpy as np",No,5,23.0 "testAdult = pd.read_csv(""../input/adult-db/test_data.csv"",header=0, index_col=0, na_values=""?"") testAdult.shape",Yes,4,45.0 "adult = pd.read_csv(""../input/mydata/train_data.csv"",
names=[
""Age"", ""Workclass"", ""fnlwgt"", ""Education"", ""Education-Num"", ""Martial Status"",
""Occupation"", ""Relationship"", ""Race"", ""Sex"", ""Capital Gain"", ""Capital Loss"",
""Hours per week"", ""Country"", ""Target""],
skiprows=1,
sep=r'\s*,\s*',
engine='python',
na_values=""?"")",No,5,45.0 "adult[""Country""].value_counts()",No,5,72.0 "adult[""Age""].value_counts().plot(kind=""bar"")",No,5,33.0 "adult[""Sex""].value_counts().plot(kind=""bar"")",No,5,33.0 "adult[""Occupation""].value_counts().plot(kind=""bar"")",No,5,33.0 "XtestAdult_unscaled = testAdult_fill.iloc[:,col_inds].apply(LabelEncoder().fit_transform) XtestAdult = minmaxscaler.transform(XtestAdult_unscaled)",Yes,3,20.0 "testAdult = pd.read_csv(""../input/mydata/test_data.csv"",
names=[
""ID"",""Age"", ""Workclass"", ""fnlwgt"", ""Education"", ""Education-Num"", ""Martial Status"",
""Occupation"", ""Relationship"", ""Race"", ""Sex"", ""Capital Gain"", ""Capital Loss"",
""Hours per week"", ""Country""],
skiprows=1,
index_col=0,
sep=r'\s*,\s*',
engine='python',
na_values=""?"")",No,5,45.0 "knn = KNeighborsClassifier(n_neighbors=34,p=1) knn.fit(Xadult,Yadult)",No,5,7.0 testAdult.head(),No,5,41.0 "YtestAdult = knn.predict(XtestAdult) YtestAdult",No,5,48.0 testAdult['Capital Gain'].plot(),No,5,33.0 "prediction = pd.DataFrame(testAdult.index) prediction[""income""] = YtestAdult",Yes,4,12.0 "prediction.to_csv(""adult_prediction_5.csv"", index=False)",No,5,25.0 "import pandas as pd import numpy as np import sklearn",No,5,22.0 "adult = pd.read_csv(""../input/adultdataset/train_data.csv"",
names=[
""Age"", ""Workclass"", ""fnlwgt"", ""Education"", ""Education-Num"", ""Martial Status"",
""Occupation"", ""Relationship"", ""Race"", ""Sex"", ""Capital Gain"", ""Capital Loss"",
""Hours per week"", ""Country"", ""Target""],
sep=r'\s*,\s*',
engine='python',
skiprows=1,
na_values=""?"")",No,5,45.0 "nTestAdult = testAdult.dropna() nTestAdult.shape",Yes,3,17.0 "Xadult = nadult[[""Age"",""Education-Num"",""Capital Gain"", ""Capital Loss"", ""Hours per week""]] Xadult.head()",No,4,41.0 Yadult = nadult.Target,No,5,21.0 "testAdult = pd.read_csv(""../input/adultdataset/test_data.csv"",
names=[
""id"", ""Age"", ""Workclass"", ""fnlwgt"", ""Education"", ""Education-Num"", ""Martial Status"",
""Occupation"", ""Relationship"", ""Race"", ""Sex"", ""Capital Gain"", ""Capital Loss"",
""Hours per week"", ""Country""],
sep=r'\s*,\s*',
engine='python',
skiprows=1,
na_values=""?"")",No,5,45.0 from sklearn.neighbors import KNeighborsClassifier,No,5,22.0 knn = KNeighborsClassifier(n_neighbors=30),No,5,4.0 YtestPred = knn.predict(XtestAdult),No,5,48.0 "result = np.vstack((testAdult[""id""], YtestPred)).T x = [""id"",""income""] Resultado = pd.DataFrame(columns = x, data = result) Resultado.to_csv(""Resultados.csv"", index = False) Resultado",Yes,3,25.0 "import os os.listdir('../input/adultb')",No,5,88.0 "adult=pd.read_csv(\'../input/adultb/train_data.csv\',
sep=',', engine='python',
na_values=""?"")",No,5,45.0 nadult = adult.copy(),No,4,77.0 adult.isnull().sum(),No,4,39.0 "Xadult = adult[['age','education.num', 'capital.gain','capital.loss', 'hours.per.week']]",No,5,21.0 "testAdult = pd.read_csv('../input/adultb/test_data.csv', sep=',',engine='python', na_values='?')",No,5,45.0 testAdult.isnull().sum(),No,4,39.0 "XtestAdult = testAdult[['age','education.num', 'capital.gain','capital.loss', 'hours.per.week']]",No,5,10.0 "result = np.vstack((testAdult[""Id""], YtestPred)).T x = [""id"",""income""] submit = pd.DataFrame(columns = x, data = result) submit.to_csv(""Resultados.csv"", index = False)",Yes,3,25.0 "import pandas as pd import sklearn import numpy as np import os from sklearn import preprocessing",No,5,22.0 "adultOriginal = pd.read_csv(""../input/adult-db/train_data.csv"",
sep=r\'\\s*,\\s*\',
engine=\'python\',
na_values=""?"")",No,5,45.0 "adultOriginal.head() ",No,5,41.0 adultOriginal.shape,No,5,58.0 "numAdult_1=adultOriginal.fillna(method='pad') numAdult_2=numAdult_1.fillna(method='pad')",No,5,17.0 "adult = numAdult_2.apply(preprocessing.LabelEncoder().fit_transform) adult",No,5,20.0 "adult[""sex""].value_counts().plot(kind=""bar"")",No,5,33.0 "adult[""race""].value_counts().plot(kind=""bar"")",No,5,33.0 "adult[""education""].value_counts().plot(kind=""bar"")",No,5,33.0 "test_adult = pd.read_csv(\'../input/adult-db/test_data.csv\',
sep=r\'\\s*,\\s*\',
engine=\'python\',
na_values=""?"")",No,5,45.0 "numAdultTest_1=test_adult.fillna(method='pad') numAdultTest_2=numAdultTest_1.fillna(method='pad') adultTest = numAdultTest_2.apply(preprocessing.LabelEncoder().fit_transform) adultTest",Yes,4,17.0 "from sklearn.neighbors import KNeighborsClassifier from sklearn.model_selection import cross_val_score",No,5,22.0 "Xadult = adult.drop(['workclass', 'marital.status', 'sex', 'occupation', 'relationship', 'income', 'capital.gain', 'capital.loss', 'native.country'], axis=1) Xadult",No,5,10.0 "XtestAdult = adultTest.drop(['workclass', 'marital.status', 'sex', 'occupation', 'relationship', 'capital.gain', 'capital.loss', 'native.country'], axis=1)",No,5,10.0 knn = KNeighborsClassifier(n_neighbors=21),No,5,4.0 data = pd.DataFrame(adultTest.Id),No,5,12.0 "data.to_csv(""BaseAdult_KNN.csv"", index=False)",No,5,25.0 "import pandas as pd import sklearn import matplotlib.pyplot as plt import numpy as np",No,5,22.0 "train = pd.read_csv(""../input/dataset/train_data.csv"",
na_values = \'?\')",No,5,45.0 train = train.dropna(),No,5,17.0 "Atrain = train[[""age"",""education.num"",""capital.gain"", ""capital.loss"", ""hours.per.week""]] Btrain = train.income",No,5,21.0 knn = KNeighborsClassifier(n_neighbors=15),No,5,4.0 "knn.fit(Atrain,Btrain)",No,5,7.0 "test = pd.read_csv(""../input/dataset/test_data.csv"",
na_values = \'?\')",No,5,45.0 "Atest = test[[""age"",""education.num"",""capital.gain"", ""capital.loss"", ""hours.per.week""]]",No,5,21.0 Bpred=knn.predict(Atest),No,5,48.0 prediction = pd.DataFrame(index = test.index),No,5,12.0 "prediction.to_csv(""submition.csv"",index=False)",No,5,25.0 "import os, cv2, re, random import numpy as np import pandas as pd from keras.preprocessing.image import ImageDataGenerator from keras.preprocessing.image import img_to_array, load_img from keras import layers, models, optimizers from keras import backend as K from sklearn.model_selection import train_test_split from sklearn.model_selection import train_test_split from sklearn.metrics import confusion_matrix import itertools import matplotlib.pyplot as plt import matplotlib.image as mpimg import seaborn as sns %matplotlib inline",No,5,23.0 "transform = transforms.Compose( [transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]) trainset = torchvision.datasets.CIFAR10(root='.', train=True, download=False, transform=transform) trainloader = torch.utils.data.DataLoader(trainset, batch_size=32, shuffle=True, num_workers=2) testset = torchvision.datasets.CIFAR10(root='.', train=False, download=False, transform=transform) testloader = torch.utils.data.DataLoader(testset, batch_size=32, shuffle=False, num_workers=2) classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')",No,3,42.0 "def imshow(img): img = img / 2 + 0.5 # unnormalize npimg = img.numpy() plt.imshow(np.transpose(npimg, (1, 2, 0))) # get some random training images dataiter = iter(trainloader) images, labels = dataiter.next() # show images imshow(torchvision.utils.make_grid(images)) # print labels print(' '.join('%5s' % classes[labels[j]] for j in range(32)))",No,5,84.0 "class Net(nn.Module): def __init__(self): super(Net, self).__init__() self.conv1 = nn.Conv2d(3, 6, 5) self.pool = nn.MaxPool2d(2, 2) self.conv2 = nn.Conv2d(6, 16, 5) self.fc1 = nn.Linear(16 * 5 * 5, 120) self.fc2 = nn.Linear(120, 84) self.fc3 = nn.Linear(84, 10) def forward(self, x): x = self.pool(F.relu(self.conv1(x))) x = self.pool(F.relu(self.conv2(x))) x = x.view(-1, 16 * 5 * 5) x = F.relu(self.fc1(x)) x = F.relu(self.fc2(x)) x = self.fc3(x) return x net = Net() if torch.cuda.is_available(): net.cuda()",No,5,4.0 "import torch.optim as optim criterion = nn.CrossEntropyLoss() if use_gpu: criterion = criterion.cuda() optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)",No,3,28.0 "# loop over the dataset multiple times for epoch in tqdm_notebook(range(10)): running_loss = 0.0 for i, data in tqdm_notebook(enumerate(trainloader, 0)): # get the inputs inputs, labels = data if torch.cuda.is_available(): # in versions of Torch < 0.4.0 we have to wrap these into torch.autograd.Variable as well inputs, labels = Variable(inputs).cuda(), Variable(labels).cuda() # zero the parameter gradients optimizer.zero_grad() # forward + backward + optimize outputs = net(inputs) loss = criterion(outputs, labels) loss.backward() optimizer.step() # print statistics running_loss += loss.data[0] if i % 2000 == 1999: # print every 2000 mini-batches print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, running_loss / 2000)) running_loss = 0.0 print('Finished Training')",No,5,7.0 "dataiter = iter(testloader) images, labels = dataiter.next() # print images imshow(torchvision.utils.make_grid(images)) print('GroundTruth: ', ' '.join('%5s' % classes[labels[j]] for j in range(32)))",No,5,84.0 "# in PyTorch 0.4.0 you won't need the Variable wrapper 
outputs = net(Variable(images).cuda()) if use_gpu else net(Variable(images))",No,5,77.0 "_, predicted = torch.max(outputs.data, 1) print('Predicted: ', ' '.join('%5s' % classes[predicted[j]] for j in range(32)))",No,5,53.0 "all_pred = np.empty((0, 10), float)",No,5,53.0 "for data in tqdm_notebook(testloader): images, _ = data if use_gpu: images = images.cuda() outputs = net(Variable(images)) curr_pred = F.softmax(outputs).data.cpu().numpy() all_pred = np.vstack([all_pred, curr_pred])",No,5,48.0 all_pred.shape,No,5,58.0 "pd.DataFrame(all_pred, columns=classes).to_csv('baseline.csv', index_label='id')",No,5,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here\'s several helpful packages to load in

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the ""../input/"" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir(""../input""))
import glob
from glob import glob

# Any results you write to the current directory are saved as output.",No,5,88.0 "train_path = '../input/train' path_name = train_path + '/**/*.jpg'",No,4,77.0 "train_image_paths = glob(path_name, recursive=True)",No,5,88.0 train_image_paths[:10],No,4,88.0 "train_categories = list(map(os.path.basename,train_image_paths))",No,3,88.0 train_categories[:3],No,5,53.0 "labels =[] for category in train_categories: labels.append(category[:3])",No,3,21.0 labels[:10],No,5,53.0 len(labels),No,5,58.0 len(train_image_paths),No,5,53.0 "num_classes = len(np.unique(labels)) num_classes",No,5,54.0 "#Encode labels with value between 0 and n_classes-1. from sklearn.preprocessing import LabelEncoder encoder = LabelEncoder() loadedLabels = np.asarray(labels) encoder.fit(loadedLabels) encoded_loadedLabels = encoder.transform(loadedLabels)",No,5,20.0 "# Encode labels to hot vectors (ex : 2 -> [0,0,1,0,0,0,0,0,0,0]) from keras.utils.np_utils import to_categorical labels_Hot = to_categorical(encoded_loadedLabels, num_classes = num_classes)",No,5,20.0 labels_Hot[:3],No,5,53.0 df = pd.DataFrame(),No,5,12.0 df,No,5,41.0 df['path']=train_image_paths,No,4,8.0 df['path'].head(),No,5,41.0 df['labels'] = list(labels_Hot),No,5,8.0 "from keras.preprocessing.image import ImageDataGenerator IMG_SIZE = (128, 128) core_idg = ImageDataGenerator(samplewise_center=True, samplewise_std_normalization=True, horizontal_flip = True, vertical_flip = False, height_shift_range= 0.05, width_shift_range=0.1, rotation_range=5, shear_range = 0.1, fill_mode = 'reflect', zoom_range=0.15)",No,4,31.0 "def flow_from_dataframe(img_data_gen, in_df, path_col, y_col, **dflow_args): base_dir = os.path.dirname(in_df[path_col].values[0]) print('## Ignore next message from keras, values are replaced anyways') df_gen = img_data_gen.flow_from_directory(base_dir, class_mode = 'sparse', **dflow_args) df_gen.filenames = in_df[path_col].values df_gen.classes = np.stack(in_df[y_col].values) df_gen.samples = in_df.shape[0] df_gen.n = in_df.shape[0] df_gen._set_index_array() df_gen.directory = '' # since we have the full path print('Reinserting dataframe: {} images'.format(in_df.shape[0])) return df_gen",No,3,21.0 "from sklearn.model_selection import train_test_split train_df, valid_df = train_test_split(df, test_size = 0.25, random_state = 2018)",No,5,13.0 len(train_df),No,5,58.0 len(valid_df),No,5,58.0 "train_gen = flow_from_dataframe(core_idg, train_df, path_col = 'path', y_col = 'labels', target_size = IMG_SIZE, batch_size = 32) valid_gen = flow_from_dataframe(core_idg, valid_df, path_col = 'path', y_col = 'labels', target_size = IMG_SIZE, batch_size = 256) # we can use much larger batches for evaluation # used a fixed dataset for evaluating the algorithm test_X, test_Y = next(flow_from_dataframe(core_idg, valid_df, path_col = 'path', y_col = 'labels', target_size = IMG_SIZE, batch_size = 1024)) # one big batch",No,4,21.0 "t_x, t_y = next(train_gen)",No,5,84.0 t_x.shape[1:],No,5,58.0 "from keras.applications import VGG16 from keras.applications.vgg16 import preprocess_input import keras from keras import backend as K from keras.callbacks import Callback, EarlyStopping, ReduceLROnPlateau, ModelCheckpoint from keras.preprocessing.image import ImageDataGenerator from keras.utils.np_utils import to_categorical from keras.models import Model,Sequential, model_from_json from keras.optimizers import SGD, RMSprop, Adam, Adagrad, Adadelta from keras.layers import Dense, Dropout, Activation, Flatten, BatchNormalization, Conv2D, MaxPool2D, MaxPooling2D %matplotlib 
inline",No,5,23.0 "pretrained_model_1 = VGG16(include_top=False, input_shape=t_x.shape[1:]) base_model = pretrained_model_1 # Topless optimizer1 = keras.optimizers.Adam() # Add top layer x = base_model.output x = Conv2D(100, kernel_size = (3,3), padding = 'valid')(x) x = Flatten()(x) x = Dropout(0.75)(x) predictions = Dense(num_classes, activation='softmax')(x) model = Model(inputs=base_model.input, outputs=predictions) # Train top layer for layer in base_model.layers: layer.trainable = False model.compile(loss='categorical_crossentropy', optimizer=optimizer1, metrics=['accuracy']) model.summary()",Yes,4,30.0 "model.fit_generator(train_gen,steps_per_epoch=100,validation_data = (test_X, test_Y), epochs = 10)",No,5,7.0 "test_image_paths = glob('../input/test/*.jpg', recursive=True)",No,5,88.0 "img_width = 150 img_height = 150 TRAIN_DIR = '../input/train/' TEST_DIR = '../input/test/' train_images_dogs_cats = [TRAIN_DIR+i for i in os.listdir(TRAIN_DIR)] # use this for full dataset test_images_dogs_cats = [TEST_DIR+i for i in os.listdir(TEST_DIR)]",No,5,77.0 "train_images_dogs_cats.sort(key=natural_keys) train_images_dogs_cats_trim = train_images_dogs_cats[0:1300] train_images_dogs_cats_trim += train_images_dogs_cats[12500:13800] test_images_dogs_cats.sort(key=natural_keys)",Yes,4,9.0 "X, Y = prepare_data(train_images_dogs_cats_trim) # First split the data in two sets, 80% for training, 20% for Val/Test) X_train, X_val, Y_train, Y_val = train_test_split(X,Y, test_size=0.2, random_state=1)",Yes,3,21.0 "model = models.Sequential() model.add(layers.Conv2D(32, (3, 3), input_shape=(img_width, img_height, 3))) model.add(layers.Activation('relu')) model.add(layers.MaxPooling2D(pool_size=(2, 2))) model.add(layers.Conv2D(32, (3, 3))) model.add(layers.Activation('relu')) model.add(layers.MaxPooling2D(pool_size=(2, 2))) model.add(layers.Conv2D(64, (3, 3))) model.add(layers.Activation('relu')) model.add(layers.MaxPooling2D(pool_size=(2, 2))) model.add(layers.Flatten()) model.add(layers.Dense(64)) model.add(layers.Activation('relu')) model.add(layers.Dropout(0.5)) model.add(layers.Dense(1)) model.add(layers.Activation('sigmoid'))",No,5,4.0 "X_test, Y_test = prepare_data(test_images_dogs_cats)",No,5,21.0 "Y_pred = model.predict(np.array(X_val)) #####predict cat | predict dog for i in range(0,5): if Y_pred[i, 0] >= 0.5: print('I am {:.2%} sure this is a Dog'.format(Y_pred[i][0])) else: print('I am {:.2%} sure this is a Cat'.format(1-Y_pred[i][0])) plt.imshow(X_val[i]) plt.show()",Yes,2,48.0 "XtestAdult = nTestAdult[[""Age"",""Education-Num"",""Capital Gain"", ""Capital Loss"", ""Hours per week""]] XtestAdult.head()",No,4,41.0 "# Validating the score on validation data score = model.evaluate_generator(validation_generator) print('Test score:', score[0]) print('Test accuracy:', score[1])",No,5,49.0 knn = KNeighborsClassifier(n_neighbors=3),No,5,4.0 from sklearn.metrics import accuracy_score,No,5,22.0 "UCITest = pd.read_csv(""../input/mydata/adult.test"",
names=[
""Age"", ""Workclass"", ""fnlwgt"", ""Education"", ""Education-Num"", ""Martial Status"",
""Occupation"", ""Relationship"", ""Race"", ""Sex"", ""Capital Gain"", ""Capital Loss"",
""Hours per week"", ""Country"", ""Target""],
sep=r\'\\s*,\\s*\',
engine=\'python\',
na_values=""?"")",No,5,45.0 UCITest.head(),No,5,41.0 nUCITest = UCITest.dropna(),No,5,17.0 nUCITest.head(),No,5,41.0 "XUCITest = nUCITest[[""Age"",""Education-Num"",""Capital Gain"", ""Capital Loss"", ""Hours per week""]]",No,5,77.0 XUCITest.head(),No,5,41.0 "YUCIPred = knn.predict(XUCITest) YUCIPred",No,5,48.0 "accuracy_score(YUCIPred, YUCITest)",No,5,49.0 "knn.fit(Xadult, Yadult)",No,5,7.0 YUCIPred = knn.predict(XUCITest),No,5,48.0 "accuracy_score(YUCITest, YUCIPred)",No,5,49.0 "accuracies = {} for i in range(1, 100): knn = KNeighborsClassifier(n_neighbors=i) knn.fit(Xadult, Yadult) scores = cross_val_score(knn, Xadult, Yadult, cv=10) Ypred = knn.predict(XUCITest) accuracy = accuracy_score(YUCITest,Ypred) accuracies[i] = accuracy print('k={}, accuracy={}, CVmean={}'.format(i, accuracy, scores.mean())) ",Yes,4,49.0 "ks = list(accuracies.keys()) acc = list(accuracies.values()) plt.plot(ks, acc) plt.show()",No,5,33.0 knn = KNeighborsClassifier(n_neighbors=28),No,5,4.0 adult['Sex'] = adult['Sex'].transform(lambda x: 1 if x=='Male' else 0 if x==x else x),No,5,8.0 "predictions = knn.predict(testAdult[[""Age"",""Education-Num"",""Capital Gain"", ""Capital Loss"", ""Hours per week""]])",No,5,48.0 "pretrained_model_1 = VGG16(include_top=False, input_shape=t_x.shape[1:])",No,5,4.0 "from keras import optimizers base_model = pretrained_model_1 # Topless add_model = Sequential() add_model.add(Flatten(input_shape=base_model.output_shape[1:])) add_model.add(Dense(256, activation='relu')) add_model.add(Dense(num_classes, activation='softmax')) model = Model(inputs=base_model.input, outputs=add_model(base_model.output)) model.compile(loss='categorical_crossentropy', optimizer=optimizers.SGD(lr=1e-4, momentum=0.9), metrics=['accuracy']) model.summary()",Yes,4,4.0 X_test = pd.DataFrame(),No,5,12.0 X_test['path'] = test_image_paths,No,5,8.0 "result = np.vstack((testAdult.index.values, predictions)).T x = ['Id','income'] resultado = pd.DataFrame(columns=x, data=result) resultado.set_index('Id', inplace=True)",Yes,3,11.0 X_test['labels'] = X_test['path'].map(lambda x: os.path.splitext(os.path.basename(x))[0]),No,5,8.0 resultado.to_csv('mypredictions.csv'),No,5,25.0 "store = pd.read_csv(""../input/store.csv"") train = pd.read_csv(""../input/train.csv"",parse_dates=[2]) test = pd.read_csv(""../input/test.csv"",parse_dates=[3])",No,5,45.0 store.head(),No,5,41.0 submission = pd.DataFrame(),No,5,12.0 "# check store nan rows store.isnull().sum()",No,5,39.0 store.PromoInterval.value_counts(),No,5,72.0 "submission.to_csv(""predictions.csv"",index=False)",No,5,25.0 "# fillna in store with 0 has better result than median() store.fillna(0, inplace=True)",No,5,17.0 "%matplotlib inline import warnings warnings.filterwarnings('ignore') import os import gc import time import pickle import feather import numpy as np import pandas as pd import seaborn as sns import matplotlib.pyplot as plt from tqdm._tqdm_notebook import tqdm_notebook as tqdm tqdm.pandas() # from tqdm import tqdm # pd.options.display.max_rows = 999 # pd.options.display.max_columns = 999 import glob def get_path(str, first=True, parent_dir='../input/**/'): res_li = glob.glob(parent_dir+str) return res_li[0] if first else res_li",No,4,88.0 train.head().append(train.tail()),No,5,41.0 train.Open.value_counts(),No,5,72.0 "DATA_DIR = '../input/dogs-vs-cats-redux-kernels-edition/' evals = pd.read_csv('../input/dvc-prepare-evalset/evals.csv') evals.head()",Yes,4,45.0 "H, W, C = 224, 224, 3 #at least 197 batch_size = 32 eval_batch_size = batch_size * 4",No,5,77.0 "# 
# draw store 1 and store 10 sales distribution plot
import matplotlib.pyplot as plt
store_1 = train.loc[(train[""Store""]==1)&(train[\'Sales\']>0), [\'Date\',""Sales""]]
store_10 = train.loc[(train[""Store""]==10)&(train[\'Sales\']>0), [\'Date\',""Sales""]]
f = plt.figure(figsize=(18,10))
ax1 = f.add_subplot(211)
ax1.plot(store_1[\'Date\'], store_1[\'Sales\'], \'-\')
ax1.set_xlabel(\'Time\')
ax1.set_ylabel(\'Sales\')
ax1.set_title(\'Store 1 Sales Distribution\')

ax2 = f.add_subplot(212)
ax2.plot(store_10[\'Date\'], store_10[\'Sales\'], \'-\')
ax2.set_xlabel(\'Time\')
ax2.set_ylabel(\'Sales\')
ax2.set_title(\'Store 10 Sales Distribution\')",No,5,75.0 "import keras.backend as K from keras.models import Model from keras.callbacks import ModelCheckpoint, LearningRateScheduler, EarlyStopping from keras import optimizers, losses, activations, models from keras.layers import Conv2D, Dense, Input, Flatten, Concatenate, Dropout, Activation from keras.layers import BatchNormalization, MaxPooling2D, GlobalAveragePooling2D from keras import applications",No,5,22.0 test.isnull().sum(),No,5,39.0 "# check stores open distribution on days of week import seaborn as sns sns.countplot(x = 'DayOfWeek', hue = 'Open', data = test) plt.title('Store Daily Open Countplot')",No,5,75.0 "# fill missing values in test with 1 test.fillna(value = 1, inplace = True)",No,5,17.0 "import seaborn as sns import matplotlib.pyplot as plt # check distribution of sales in train set fig = plt.figure(figsize=(12,5)) ax1 = fig.add_subplot(121) ax2 = fig.add_subplot(122) g1 = sns.distplot(train['Sales'],hist = True,label='skewness:{:.2f}'.format(train['Sales'].skew()),ax = ax1) g1.legend() g1.set(xlabel = 'Sales', ylabel = 'Density', title = 'Sales Distribution') g2 = sns.distplot(np.log1p(train['Sales']),hist = True,label='skewness:{:.2f}'.format(np.log1p(train['Sales']).skew()),ax=ax2) g2.legend() g2.set(xlabel = 'log(Sales+1)',ylabel = 'Density', title = 'log(Sales+1) Distribution') plt.show()",No,5,33.0 "# process train and test
def process(data, isTest = False):
# label encode some features
mappings = {'0':0, 'a':1, 'b':2, 'c':3, 'd':4}
data.StoreType.replace(mappings, inplace=True)
data.Assortment.replace(mappings, inplace=True)
data.StateHoliday.replace(mappings, inplace=True)

# extract some features from date column
data['Month'] = data.Date.dt.month
data['Year'] = data.Date.dt.year
data['Day'] = data.Date.dt.day
data['WeekOfYear'] = data.Date.dt.weekofyear

# calculate competiter open time in months
data['CompetitionOpen'] = 12 * (data.Year - data.CompetitionOpenSinceYear) + \\
(data.Month - data.CompetitionOpenSinceMonth)
data['CompetitionOpen'] = data['CompetitionOpen'].apply(lambda x: x if x > 0 else 0)
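    # negative values mean the competitor had not opened yet at this row's date, so they are clamped to zero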

# calculate promo2 open time in months
data['PromoOpen'] = 12 * (data.Year - data.Promo2SinceYear) + \\
(data.WeekOfYear - data.Promo2SinceWeek) / 4.0
data['PromoOpen'] = data['PromoOpen'].apply(lambda x: x if x > 0 else 0)

# Indicate whether the month is in promo interval
month2str = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', \\
7:'Jul', 8:'Aug', 9:'Sept', 10:'Oct', 11:'Nov', 12:'Dec'}
data['month_str'] = data.Month.map(month2str)

def check(row):
if isinstance(row['PromoInterval'],str) and row['month_str'] in row['PromoInterval']:
return 1
else:
return 0

data['IsPromoMonth'] = data.apply(lambda row: check(row),axis=1)

# select the features we need
features = ['Store', 'DayOfWeek', 'Promo', 'StateHoliday', 'SchoolHoliday',
'StoreType', 'Assortment', 'CompetitionDistance',
'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2',
'Promo2SinceWeek', 'Promo2SinceYear', 'Year', 'Month', 'Day',
'WeekOfYear', 'CompetitionOpen', 'PromoOpen', 'IsPromoMonth']
if not isTest:
features.append('Sales')

data = data[features]
return data

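# apply the same feature engineering to every split: train, valid, train_total and the test set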
train = process(train)
valid = process(valid)
train_total = process(train_total)
x_test = process(test,isTest = True) ",No,5,8.0 "train_steps = int(np.ceil(train_flow.n / batch_size)) valid_steps = int(np.ceil(valid_flow.n / eval_batch_size)) test_steps = int(np.ceil(test_flow.n / eval_batch_size))",No,5,77.0 "# try random forest from sklearn.ensemble import RandomForestRegressor clf = RandomForestRegressor(n_estimators = 15) clf.fit(x_train, y_train) # validation y_pred = clf.predict(x_valid) error = rmspe(np.expm1(y_valid), np.expm1(y_pred)) print('RMSPE: {:.4f}'.format(error))",Yes,3,4.0 "eval_res = pd.DataFrame(history.history) eval_res.to_csv('eval_res_init.csv', index=False) for c in ['acc', 'loss']: eval_res[[c, f'val_{c}']].plot(figsize=[18, 4]); plt.xlabel('Epoch'); plt.ylabel(c); plt.title(c); plt.grid();",Yes,3,56.0 "import xgboost as xgb

params = {""objective"": ""reg:linear"", # for linear regression
""booster"" : ""gbtree"", # use tree based models
""eta"": 0.03, # learning rate
""max_depth"": 10, # maximum depth of a tree
""subsample"": 0.9, # Subsample ratio of the training instances
""colsample_bytree"": 0.7, # Subsample ratio of columns when constructing each tree
""silent"": 1, # silent mode
""seed"": 10 # Random number seed
}
num_boost_round = 4000

dtrain = xgb.DMatrix(x_train, y_train)
dvalid = xgb.DMatrix(x_valid, y_valid)
watchlist = [(dtrain, \'train\'), (dvalid, \'eval\')]
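# xgboost uses the last watchlist entry ('eval') for early stopping; feval=rmspe_xg scores it with the custom RMSPE metric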
# train the xgboost model
model = xgb.train(params, dtrain, num_boost_round, evals=watchlist, \\
early_stopping_rounds= 100, feval=rmspe_xg, verbose_eval=True)",Yes,4,59.0 "# validation y_pred = model.predict(xgb.DMatrix(x_valid)) error = rmspe(np.expm1(y_valid), np.expm1(y_pred)) print('RMSPE: {:.4f}'.format(error)) ",Yes,4,48.0 x_train_total.head().append(x_train_total.tail()),No,5,11.0 "print(x_train_total.shape) print(y_train_total.shape)",No,5,58.0 "dtrain = xgb.DMatrix(x_train_total, y_train_total) dtest = xgb.DMatrix(x_test) # specify parameters via map params = {""objective"": ""reg:linear"", # for linear regression ""booster"" : ""gbtree"", # use tree based models ""eta"": 0.03, # learning rate ""max_depth"": 10, # maximum depth of a tree ""subsample"": 0.9, # Subsample ratio of the training instances ""colsample_bytree"": 0.7, # Subsample ratio of columns when constructing each tree ""silent"": 1, # silent mode ""seed"": 10 # Random number seed } num_round = 3000 model = xgb.train(params, dtrain, num_round) # make prediction preds = model.predict(dtest)",Yes,4,59.0 model.load_weights('model.h5'),No,5,30.0 "DATA_DIR = '../input/dogs-vs-cats-redux-kernels-edition/' evals = pd.read_csv('../input/dvc-prepare-evalset/evals.csv') evals['path'] = evals['path'].apply(lambda x: x.replace('../input/', DATA_DIR)) evals.head()",Yes,4,45.0 "H, W, C = 150, 150, 3 batch_size = 32 eval_batch_size = batch_size * 4",No,5,77.0 "import tensorflow as tf import keras from keras.preprocessing.image import ImageDataGenerator train_gen = ImageDataGenerator( rotation_range=20, #width_shift_range=0.2, #height_shift_range=0.2, shear_range=0.2, zoom_range=0.2, #channel_shift_range=0.2, horizontal_flip=True, #vertical_flip=True, #rescale=1./255,#!!!!NO! preprocessing_function=lambda x:(x-x.mean())/x.std() ) test_gen = ImageDataGenerator( #rescale=1./255, preprocessing_function=lambda x:(x-x.mean())/x.std() )",Yes,3,22.0 "eval_res = pd.DataFrame(history.history) eval_res.to_csv('eval_res_finetune.csv', index=False) for c in ['acc', 'loss']: eval_res[[c, f'val_{c}']].plot(figsize=[18, 4]); plt.xlabel('Epoch'); plt.ylabel(c); plt.title(c); plt.grid();",No,4,56.0 "pred_val.shape, valid_flow.classes.shape",No,5,58.0 "n_final_state = 32 def get_model(n_final_state, lr=1e-3, decay=1e-8): input_shape = (H, W, C) input_x = Input(shape=input_shape) c1 = Conv2D(32, (3, 3))(input_x) c1 = BatchNormalization()(c1) c1 = Activation('relu')(c1) c1 = MaxPooling2D((2, 2))(c1) c2 = Conv2D(32, (3, 3))(c1) c2 = BatchNormalization()(c2) c2 = Activation('relu')(c2) c2 = MaxPooling2D((2, 2))(c2) c3 = Conv2D(64, (3, 3))(c2) c3 = BatchNormalization()(c3) c3 = Activation('relu')(c3) c3 = MaxPooling2D((2, 2))(c3) flat = Flatten()(c3) d1 = Dense( 64, activation='relu' )(flat) #d1 = Dropout(0.5)(d1) d1 = BatchNormalization()(d1) final_state = Dense( n_final_state, activation='relu', name='final_state' )(d1) x = Dropout(0.5)(final_state) outputs = Dense(1, activation='sigmoid')(x) model = Model(inputs=input_x, outputs=outputs) optimizer=optimizers.Adam(lr=lr, decay=decay) model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy']) return model model = get_model(n_final_state=n_final_state) model.summary()",No,5,4.0 "train_steps = int(np.ceil(train_flow.n / batch_size)) valid_steps = int(np.ceil(valid_flow.n / eval_batch_size)) test_steps = int(np.ceil(test_flow.n / eval_batch_size)) print(f'train {train_steps} steps') print(f'valid {valid_steps} steps') print(f'test {test_steps} steps')",No,5,77.0 "from sklearn.metrics import log_loss, accuracy_score
val_loss = log_loss(y_valid, pred_val)
val_acc = accuracy_score(y_valid, np.round(pred_val))
print(f'valid loss: {val_loss}\\t valid accuracy: {val_acc}')",No,5,49.0 "evals.loc[evals['is_test']==1, 'img_id'].shape",No,5,58.0 "eval_res = pd.DataFrame(history.history) eval_res.to_csv('eval_res.csv', index=False) for c in ['acc', 'loss']: eval_res[[c, f'val_{c}']].plot(figsize=[18, 6]); plt.xlabel('Epoch'); plt.ylabel(c); plt.title(c); plt.grid();",Yes,3,25.0 "subname = f'resnet50ft_{val_loss:.6f}.csv' sub.to_csv(subname, index=False) print(subname, 'saved')",No,5,25.0 "def predict(model, modelpath, data_flow, steps, workers=4, verbose=1): model.load_weights(modelpath) pred = model.predict_generator( generator=data_flow, steps=steps, use_multiprocessing=True if workers>1 else False, workers=workers, verbose=verbose ) return pred",Yes,3,30.0 "train = pd.read_csv('../input/train.csv') test = pd.read_csv('../input/test.csv')",No,5,45.0 "from sklearn.linear_model import LinearRegression from sklearn.preprocessing import StandardScaler from sklearn.pipeline import Pipeline from sklearn.preprocessing import PolynomialFeatures",No,5,22.0 "print(pred_val_best.shape, pred_val_best_tta.shape) sns.distplot(pred_val_best) sns.distplot(pred_val_best_tta) plt.legend(['normal', 'fliplr']); plt.grid();",Yes,4,33.0 "X_train = train[['LotFrontage','LotArea']].fillna(0) X_test = test[['LotFrontage','LotArea']].fillna(0) y_train = train['SalePrice']",Yes,4,17.0 "from sklearn.model_selection import cross_val_score from sklearn.metrics import mean_squared_error",No,5,22.0 "for i,(p, p_tta) in enumerate(zip(pred_val_li, pred_val_tta_li)): print(i+2, 'th snapshot normal loss: {:.6f} acc: {:.6f}'.format( log_loss(y_valid, p), accuracy_score(y_valid, np.round(p)) )) print(i+2, 'th snapshot tta loss: {:.6f} acc: {:.6f}'.format( log_loss(y_valid, p_tta), accuracy_score(y_valid, np.round(p_tta)) ))",No,5,49.0 "X_meta = pred_val_li + pred_val_tta_li X_meta = np.hstack(X_meta) X_meta.shape",Yes,4,11.0 "pipe.fit(X_train, y_train)",No,5,7.0 preds = pipe.predict(X_test),No,5,48.0 "sub = pd.DataFrame({'Id': test.Id, 'SalePrice': preds}) sub.to_csv('submission.csv', index=False)",No,5,25.0 "from sklearn.linear_model import LogisticRegressionCV meta_model = LogisticRegressionCV(scoring='neg_log_loss') meta_model.fit(X_meta, y_valid) print(meta_model.coef_, meta_model.intercept_)",Yes,3,7.0 "train = pd.read_csv(""../input/comp_train.csv"") test = pd.read_csv(""../input/comp_test.csv"") print(train.shape)",Yes,4,45.0 "import numpy as np import pandas as pd import matplotlib.pyplot as plt %matplotlib inline import cv2 import os",No,5,23.0 "# Fixed for our Cats & Dogs classes NUM_CLASSES = 2 # Fixed for Cats & Dogs color images CHANNELS = 3 IMAGE_RESIZE = 224 RESNET50_POOLING_AVERAGE = 'avg' DENSE_LAYER_ACTIVATION = 'softmax' OBJECTIVE_FUNCTION = 'categorical_crossentropy' # Common accuracy metric for all outputs, but can use different metrics for different output LOSS_METRICS = ['accuracy'] # EARLY_STOP_PATIENCE must be < NUM_EPOCHS NUM_EPOCHS = 10 EARLY_STOP_PATIENCE = 3 # These steps value should be proper FACTOR of no.-of-images in train & valid folders respectively # Training images processed in each step would be no.-of-train-images / STEPS_PER_EPOCH_TRAINING STEPS_PER_EPOCH_TRAINING = 10 STEPS_PER_EPOCH_VALIDATION = 10 # These steps value should be proper FACTOR of no.-of-images in train & valid folders respectively # NOTE that these BATCH* are for Keras ImageDataGenerator batching to fill epoch step input BATCH_SIZE_TRAINING = 100 BATCH_SIZE_VALIDATION = 100 # Using 1 to easily manage mapping between test_generator 
& prediction for submission preparation BATCH_SIZE_TESTING = 1",No,5,77.0 import matplotlib.pyplot as plt,No,5,22.0 "from tensorflow.python.keras.applications import ResNet50 from tensorflow.python.keras.models import Sequential from tensorflow.python.keras.layers import Dense ### ### Below systax is available with TensorFlow 1.11 onwards but this upgrade is not available for Kaggle kernel yet ### #import tensorflow as tf #print(tf.__version__) #import tensorflow as tf #from tf.keras.applications import ResNet50 #from tf.keras.models import Sequential",No,5,22.0 resnet_weights_path = '../input/resnet50/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5',No,5,77.0 "#Still not talking about our train/test data or any pre-processing. model = Sequential() # 1st layer as the lumpsum weights from resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5 # NOTE that this layer will be set below as NOT TRAINABLE, i.e., use it as is model.add(ResNet50(include_top = False, pooling = RESNET50_POOLING_AVERAGE, weights = resnet_weights_path)) # 2nd layer as Dense for 2-class classification, i.e., dog or cat using SoftMax activation model.add(Dense(NUM_CLASSES, activation = DENSE_LAYER_ACTIVATION)) # Say not to train first layer (ResNet) model as it is already trained model.layers[0].trainable = False",No,5,4.0 "plt.scatter(train[""OverallQual""],train[""SalePrice""])",No,5,33.0 "plt.scatter(train[""TotalBsmtSF""],train[""SalePrice""])",No,5,33.0 "plt.scatter(train[""YearBuilt""],train[""SalePrice""])",No,5,33.0 "X_train = train[""TotalBsmtSF""].values y_train = train[""SalePrice""].values ",No,5,21.0 "fit_history = model.fit_generator( train_generator, steps_per_epoch=STEPS_PER_EPOCH_TRAINING, epochs = NUM_EPOCHS, validation_data=validation_generator, validation_steps=STEPS_PER_EPOCH_VALIDATION, callbacks=[cb_checkpointer, cb_early_stopper] ) model.load_weights(""../working/best.hdf5"")",No,4,30.0 "import numpy as np m=train.shape[0] #changing the shape to ,x1 one=np.ones((m,1)) X_train = X_train.reshape((m,1)) y_train = y_train.reshape((m,1)) X1=np.hstack((X_train,one)) ",Yes,3,11.0 " plt.figure(1, figsize = (15,8)) plt.subplot(221) plt.plot(fit_history.history['acc']) plt.plot(fit_history.history['val_acc']) plt.title('model accuracy') plt.ylabel('accuracy') plt.xlabel('epoch') plt.legend(['train', 'valid']) plt.subplot(222) plt.plot(fit_history.history['loss']) plt.plot(fit_history.history['val_loss']) plt.title('model loss') plt.ylabel('loss') plt.xlabel('epoch') plt.legend(['train', 'valid']) plt.show()",No,5,35.0 "TRAIN_PATH = os.path.join(""../input/ai-academy-intermediate-class-competition-1"", ""BBC News Train.csv"") #Load the data using pandas : Create a DataFrame named df, that contains the training data df = pd.read_csv(TRAIN_PATH)",No,5,45.0 "X_test=test.iloc[:,1:].values m_test=X_test.shape[0] one=np.ones((m_test,1)) X_test=np.hstack((X_test,one))",Yes,3,11.0 "# List first 5 entries in dataframe to make sure it was loaded properly # and review the various colums in the dataframe df.head()",No,5,41.0 "# Associate Category names with numerical index and save it in new column category_id df['category_id'] = df['Category'].factorize()[0] #View first 10 entries of category_id, as a sanity check df['category_id'][0:10]",No,5,8.0 "prediction=np.dot(X_test,theta) prediction",No,5,48.0 "# Create a new pandas dataframe ""category_id_df"", which only has unique Categories, also sorting this list in order of category_id values
category_id_df = df[[\'Category\', \'category_id\']].drop_duplicates().sort_values(\'category_id\')",No,5,9.0 "sub=pd.DataFrame()
sub[\'Id\'] = test[\'Id\']
sub[\'SalePrice\']=prediction
sub.to_csv(""prediction.csv"", index = False)",Yes,4,25.0 "# Create a dictionary ( python datastructure - like a lookup table) that # can easily convert category names into category_ids and vice-versa category_to_id = dict(category_id_df.values) id_to_category = dict(category_id_df[['category_id', 'Category']].values)",No,5,77.0 print(prediction.shape),No,5,58.0 "# Pick 5 random samples from the dataframe df.sample(5, random_state=0)",No,5,41.0 "import numpy as np import pandas as pd import matplotlib.pyplot as plt import os RANDOM_STATE = 31415",Yes,4,22.0 "# Group the dataframe by categories and count items ( number of news articles) in each category df.groupby('Category').category_id.count() ",No,5,72.0 "#Plot the distribution of news articles by category df.groupby('Category').category_id.count().plot.bar(ylim=0)",No,5,33.0 "from sklearn.feature_extraction.text import TfidfVectorizer tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english') features = tfidf.fit_transform(df.Text).toarray() # Remaps the words in the 1490 articles in the text column of # data frame into features (superset of words) with an importance assigned # based on each words frequency in the document and across documents labels = df.category_id # represents the category of each of the 1490 articles ",No,5,8.0 "#Get a feel of the features identified by tfidf features.shape # How many features are there ? ",No,5,58.0 "# metric to optimize from sklearn.metrics import mean_squared_error from sklearn.metrics import make_scorer scorer = make_scorer(lambda y_test, predictions: np.sqrt(mean_squared_error(y_test, predictions)))",Yes,4,49.0 "colors = [\'pink\', \'green\', \'midnightblue\', \'orange\', \'darkgrey\']

# Find points belonging to each category and plot them
for category, category_id in sorted(category_to_id.items()):
points = projected_features[(labels[indices] == category_id).values]
plt.scatter(points[:, 0], points[:, 1], s=30, c=colors[category_id], label=category)
plt.title(""tf-idf feature vector for each article, projected on 2 dimensions."",
fontdict=dict(fontsize=15))
plt.legend()",No,5,33.0 "training_set = pd.read_csv('../input/train.csv') training_set['datetime'] = training_set['datetime'].apply(lambda x: pd.to_datetime(x).timestamp())",Yes,4,45.0 features.shape,No,5,58.0 training_set.head(),No,5,41.0 "from sklearn.model_selection import train_test_split # Basic preprocessing which applies to all regression techniques (dependent variable: casual) data = training_set.drop(columns = ['registered', 'count']) X_train, X_test, y_train, y_test = train_test_split(data, data.casual, test_size=0.2, random_state = RANDOM_STATE) X_train = X_train.drop(columns = ['casual']) X_test = X_test.drop(columns = ['casual'])",Yes,3,13.0 "from sklearn.ensemble import RandomForestRegressor from sklearn.model_selection import RandomizedSearchCV ?np.random.uniform",No,5,84.0 "casual_model.fit(X_train, y_train) casual_model = casual_model.best_estimator_",Yes,4,7.0 "# Same thing for the second variable # Basic preprocessing which applies to all regression techniques (dependent variable: casual) data = training_set.drop(columns = ['casual', 'count']) X_train, X_test, y_train, y_test = train_test_split(data, data.registered, test_size=0.2, random_state = RANDOM_STATE) X_train = X_train.drop(columns = ['registered']) X_test = X_test.drop(columns = ['registered'])",Yes,3,13.0 "registered_model.fit(X_train, y_train) registered_model = registered_model.best_estimator_",Yes,4,7.0 "from sklearn.linear_model import LogisticRegression from sklearn.ensemble import RandomForestClassifier from sklearn.naive_bayes import MultinomialNB from sklearn.model_selection import cross_val_score models = [ RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0), MultinomialNB(), LogisticRegression(random_state=0), ] ",Yes,4,82.0 "# Final prediction of the baseline models, as I am not going to tweak them, I will move directly to the test data

test_dataset = pd.read_csv(""../input/test.csv"")
dates = test_dataset[\'datetime\']
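# keep the raw datetime strings so they can be written back into the submission frame later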
test_dataset[\'datetime\'] = test_dataset[\'datetime\'].apply(lambda x: pd.to_datetime(x).timestamp())",Yes,4,45.0 "CV = 5 # Cross Validate with 5 different folds of 20% data ( 80-20 split with 5 folds ) #Create a data frame that will store the results for all 5 trials of the 3 different models cv_df = pd.DataFrame(index=range(CV * len(models))) entries = [] # Initially all entries are empty",No,4,12.0 "casual = casual_model.predict(test_data) registered = registered_model.predict(test_data) total = casual + registered",No,5,48.0 "#For each Algorithm for model in models: model_name = model.__class__.__name__ # create 5 models with different 20% test sets, and store their accuracies accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV) # Append all 5 accuracies into the entries list ( after all 3 models are run, there will be 3x5 = 15 entries) for fold_idx, accuracy in enumerate(accuracies): entries.append((model_name, fold_idx, accuracy))",No,5,3.0 "# Store the entries into the results dataframe and name its columns cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])",No,5,12.0 "test_dataset['datetime'] = dates test_dataset['count'] = pd.Series(total)",No,5,8.0 "import seaborn as sns

sns.boxplot(x=\'model_name\', y=\'accuracy\', data=cv_df)
sns.stripplot(x=\'model_name\', y=\'accuracy\', data=cv_df,
size=8, jitter=True, edgecolor=""gray"", linewidth=2)",No,5,33.0 test_dataset[test_dataset['count'] < 0],No,5,14.0 "# Mean accuracy of each algorithm cv_df.groupby('model_name').accuracy.mean()",No,5,60.0 "test_dataset.loc[test_dataset['count'] < 0, 'count'] = 0",No,5,8.0 test_dataset[test_dataset['count'] <= 0],No,5,14.0 cv_df,No,5,41.0 "test_dataset[['datetime', 'count']].to_csv('result.csv', index = False)",No,5,25.0 "from sklearn.model_selection import train_test_split model = LogisticRegression(random_state=0) #Split Data X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(features, labels, df.index, test_size=0.33, random_state=0) #Train Algorithm model.fit(X_train, y_train) # Make Predictions y_pred_proba = model.predict_proba(X_test) y_pred = model.predict(X_test)",Yes,3,7.0 "import numpy as np import pandas as pd import seaborn as sns import matplotlib.pyplot as plt sns.set(color_codes=True) import os print(os.listdir(""../input""))",No,5,88.0 "df = pd.read_csv(""../input/train.tsv"", sep=""\\t"")
df_test = pd.read_csv(""../input/test.tsv"", sep=""\\t"")",No,5,45.0 df.shape,No,5,58.0 "from sklearn.metrics import confusion_matrix import seaborn as sns conf_mat = confusion_matrix(y_test, y_pred) sns.heatmap(conf_mat, annot=True, fmt='d', xticklabels=category_id_df.Category.values, yticklabels=category_id_df.Category.values) plt.ylabel('Actual') plt.xlabel('Predicted')",No,5,80.0 "model.fit(features, labels)",No,5,7.0 "import os print(os.listdir(""../input/bbc-test""))",No,5,88.0 "TEST_PATH = os.path.join(""../input/bbc-test"", ""BBC News Test.csv"") #Load the data using pandas : Create a DataFrame test_df = pd.read_csv(TEST_PATH) ",No,5,45.0 "from sklearn.feature_extraction.text import CountVectorizer from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.svm import LinearSVC from sklearn.pipeline import Pipeline from sklearn.model_selection import StratifiedKFold",No,5,22.0 "test_features = tfidf.transform(test_df.Text.tolist()) Y_pred = model.predict(test_features) Y_pred",Yes,3,8.0 "import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) import datetime from sklearn.ensemble import RandomForestClassifier from sklearn.model_selection import train_test_split from sklearn.metrics import log_loss",No,5,22.0 "#Create Submission Dataframe submission = pd.DataFrame({ ""ArticleId"": test_df[""ArticleId""], ""Category"": Y_pred_name })",No,5,12.0 "# Convert submission dataframe to csv # you could use any filename. We choose submission here submission.to_csv('submission.csv', index=False)",No,5,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here\'s several helpful packages to load in

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the ""../input/"" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir(""../input/""))

# Any results you write to the current directory are saved as output.",No,5,88.0 "import pandas as pd import numpy as np import seaborn as sns import missingno as msno import gc",No,5,22.0 "def PrepareFeatures(TestOrTrain):
source = f\'../input/{TestOrTrain}.json\'
data = pd.read_json(source)
# Some noise in price feature we saw in Part 1:
ulimit = np.percentile(data.price.values, 99)
data[\'price\'][data[\'price\']>ulimit] = ulimit
# Get Different features as in Part 1:
data[\'hasDesc\'] = data[\'description\'].apply(lambda x: len(x.strip())!=0)
data[""nFeatures""] = data[""features""].apply(len)
data[""nDescWords""] = data[""description""].apply(lambda x: len(x.split("" "")))
data[\'nPhotos\'] = data[\'photos\'].apply(lambda x: min(10, len(x)))
data[\'created\'] = pd.to_datetime(data[\'created\'])
data[\'month\'] = data[\'created\'].dt.month
data[\'weekday\'] = data[\'created\'].apply(lambda x: x.weekday())
return data

# Using categorical (more sparse) data, we ispected in Part 1:
def CreateCategFeat(data, features_list):
f_dict = {\'hasParking\':[\'parking\', \'garage\'], \'hasGym\':[\'gym\', \'fitness\', \'health club\'],
\'hasPool\':[\'swimming pool\', \'pool\'], \'noFee\':[\'no fee\', ""no broker\'s fees""],
\'hasElevator\':[\'elevator\'], \'hasGarden\':[\'garden\', \'patio\', \'outdoor space\'],
\'isFurnished\': [\'furnished\', \'fully equipped\'],
\'reducedFee\':[\'reduced fee\', \'low fee\'],
\'hasAC\':[\'air conditioning\', \'central a/c\', \'a/c\', \'central air\', \'central ac\'],
\'hasRoof\':[\'roof\', \'sundeck\', \'private deck\', \'deck\'],
\'petFriendly\':[\'pets allowed\', \'pet friendly\', \'dogs allowed\', \'cats allowed\'],
\'shareable\':[\'shares ok\'], \'freeMonth\':[\'month free\'],
\'utilIncluded\':[\'utilities included\']}
for feature in features_list:
data[feature] = False
for ind, row in data.iterrows():
for f in row[\'features\']:
f = f.lower().replace(\'-\', \'\')
if any(e in f for e in f_dict[feature]):
data.at[ind, feature]= True",No,5,8.0 "data = PrepareFeatures(\'train\')
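# these flag names must match the keys of f_dict inside CreateCategFeat above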
cat_features = [\'hasParking\', \'hasGym\', \'hasPool\', \'noFee\', \'hasElevator\',
\'hasGarden\', \'isFurnished\', \'reducedFee\', \'hasAC\', \'hasRoof\',
\'petFriendly\', \'shareable\', \'freeMonth\', \'utilIncluded\']
CreateCategFeat(data, cat_features)
features = [""bathrooms"", ""bedrooms"", ""latitude"", ""longitude"", ""price"",
""nPhotos"", ""hasDesc"", \'nFeatures\', \'nDescWords\', ""month"", \'weekday\']
features.extend(cat_features)
X = data[features]
y = data[""interest_level""]",No,5,21.0 "X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.05) clf = RandomForestClassifier(n_estimators=2000) clf.fit(X_train, y_train) y_val_pred = clf.predict_proba(X_val) log_loss(y_val, y_val_pred)",Yes,3,7.0 "test = PrepareFeatures('test') CreateCategFeat(test, cat_features) X = test[features] y = clf.predict_proba(X)",Yes,4,48.0 "labels2idx = {label: i for i, label in enumerate(clf.classes_)} sub = pd.DataFrame() sub[""listing_id""] = test[""listing_id""] for label in [""high"", ""medium"", ""low""]: sub[label] = y[:, labels2idx[label]] sub.to_csv(""submission_rf.csv"", index=False)",Yes,4,25.0 "import math import numpy as np import pandas as pd import matplotlib.pyplot as plt %matplotlib inline ",Yes,4,22.0 "df = pd.read_csv('../input/training/training.csv') df_test=pd.read_csv('../input/test/test.csv') df.dropna(inplace=True) df.shape",Yes,3,45.0 "y = df.iloc[:, :-1].values y.shape",Yes,4,14.0 "from sklearn.model_selection import train_test_split x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=42) x_train.shape, x_val.shape",Yes,4,13.0 "x_train.shape, x_val.shape",No,5,58.0 "# Definir correctamente la red neuronal (5 pts)
from keras.models import Sequential
from keras.layers import GlobalAveragePooling2D, Dense, Flatten,BatchNormalization, Dropout, Conv2D, MaxPool2D
from keras.optimizers import Adam, SGD
from keras import regularizers

lr = 0.01
bs = 256
nb = math.ceil(len(x_train)/bs)

final_model = Sequential([
Conv2D(32, 3, activation='relu', input_shape=(96,96,1)),
MaxPool2D(),
Conv2D(16, 3, activation='relu'),
GlobalAveragePooling2D(),
Dense(256, activation='relu', kernel_initializer='glorot_normal'),
Dropout(0.7),
Dense(128, activation='relu', kernel_initializer='glorot_normal'),
Dense(64, activation='relu', kernel_initializer='glorot_normal'),
Dense(30) # no activation function is used here because a regression is performed for each coordinate
])
final_model.compile(Adam(lr), loss='mse', metrics=['mae'])
final_model.summary()",Yes,4,4.0 "log = final_model.fit(x_train, y_train, batch_size=100, epochs=100,validation_data=[x_val, y_val])",No,5,7.0 "# Training results # - mae between 10 and 15 (3 pts) # - mae between 8 and 11 (5 pts) # - mae between 5 and 8 (7 pts) # - mae less than or equal to 4.0 (9 pts) print(f'MAE final: {final_model.evaluate(x_val, y_val)[1]}')",No,5,49.0 "x_val[0,None].shape",No,5,58.0 "results=final_model.predict(test) results.shape",Yes,4,48.0 "lookup = pd.read_csv('../input/IdLookupTable.csv') ",No,5,45.0 "submission = pd.concat([rowid,loc],axis = 1)",No,5,11.0 "submission.to_csv('submission2.csv', index=False)",No,5,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here\'s several helpful packages to load in

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the ""../input/"" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir(""../input/atividade-3-pmr3508""))
print(os.listdir(""../input/distance-to-coast""))
print(os.listdir(""../input/califdata""))
# Any results you write to the current directory are saved as output.",No,5,88.0 "train_data = pd.read_csv(""../input/atividade-3-pmr3508/train.csv"")
test_data = pd.read_csv(""../input/atividade-3-pmr3508/test.csv"")
#Saving the id\'s, just in case they are needed in the submission
train_Id = train_data.loc[:,\'Id\']
test_Id = test_data.loc[:,\'Id\']
test_data = test_data.drop(\'Id\',axis = \'columns\')
train_data = train_data.drop(\'Id\',axis = \'columns\')
train_data

",Yes,3,45.0 "def adding_new_features(df): df.loc[:,'mean_rooms'] = df.loc[:,'total_rooms']/df.loc[:,'households'] df.loc[:,'rooms_per_person'] = df.loc[:,'total_rooms']/df.loc[:,'population'] df.loc[:,'mean_bedrooms'] = df.loc[:,'total_bedrooms']/df.loc[:,'households'] df.loc[:,'bedrooms_per_person'] = df.loc[:,'total_bedrooms']/df.loc[:,'households'] df.loc[:,'persons_per_household'] = df.loc[:,'population']/df.loc[:,'households'] df.loc[:, 'median_income_per_person'] = df.loc[:,'median_income']/df.loc[:,'persons_per_household'] adding_new_features(train_data) adding_new_features(test_data) train_data ",No,5,8.0 "train_data['longitude'].plot(kind='hist') test_data['longitude'].plot(kind='hist') ",No,5,33.0 "train_data['latitude'].plot(kind='hist') test_data['latitude'].plot(kind='hist')",No,5,33.0 "dist2coast = pd.read_csv(""../input/distance-to-coast/dist2coast.txt"",delim_whitespace = True) ",No,5,45.0 "def saving(name,y_predict): # ""saving"" some time with a compressed writing code
df = pd.DataFrame()
df[\'Id\'] = test_Id
df.set_index(\'Id\', inplace=True)
df[\'median_house_value\'] =y_predict
print(df)
return df.to_csv(name)",Yes,4,25.0 "from sklearn import tree
x_train_data = train_data.drop(\'median_house_value\', axis = \'columns\')
y_train_data = train_data.loc[:,\'median_house_value\']
reg1 = tree.DecisionTreeRegressor(max_depth = 1)
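# max_depth=1 is effectively a decision stump, so this serves only as a very rough baseline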
reg1 = reg1.fit(x_train_data, y_train_data)
DTR_y = reg1.predict(test_data)
saving(""DecisionTreeRegression_1.csv"",DTR_y)
",Yes,3,7.0 "from sklearn.neighbors import KNeighborsRegressor reg2 = KNeighborsRegressor(n_neighbors=50) #50nn = 0.37494 at 70% of the test database, 1000nn = 0.38598 reg2 = reg2.fit(x_train_data, y_train_data) knnR_y = reg2.predict(test_data) saving(""50nn Regressor.csv"", knnR_y)",Yes,3,7.0 "from sklearn import linear_model reg3 = linear_model.LassoLars(alpha=.1, positive = True) reg3.fit(x_train_data, y_train_data) print (reg3.coef_) LASSO_y = reg3.predict(test_data) saving(""LASSO LARS.csv"", LASSO_y) # score = 0.38951 ",Yes,3,7.0 "from sklearn.neural_network import MLPRegressor reg4 = MLPRegressor() reg4.fit(x_train_data, y_train_data) MLP_y = reg4.predict(test_data) saving(""MultiLayerPerceptrons Regressor.csv"", MLP_y) #0.373",Yes,3,7.0 "reg5 = linear_model.BayesianRidge() reg5.fit(x_train_data, y_train_data) BRR_y = reg5.predict(test_data) scores5 = cross_val_score(reg5, x_train_data, y_train_data, cv = 10) saving(""Bayesian Ridge Regressor.csv"", BRR_y)",Yes,3,7.0 "reg7 = ExtraTreesRegressor() reg7.fit(x_train_data, y_train_data) ET_y = reg7.predict(test_data) saving(""ExtraTreesRegressor.csv"", ET_y) # scores = 0.24383 ",Yes,3,7.0 "from sklearn.ensemble import AdaBoostRegressor reg8 = AdaBoostRegressor(base_estimator = RandomForestRegressor(max_depth=20), n_estimators=50) reg8.fit(x_train_data, y_train_data) ADA_y = reg8.predict(test_data) saving(""ADA Boost Regression.csv"",ADA_y) # 0.22756",Yes,3,7.0 "reg_F = AdaBoostRegressor(base_estimator = RandomForestRegressor(max_depth=20), n_estimators=50) reg_F.fit(x_train_data, y_train_data) final = reg_F.predict(test_data) saving(""Final ADA Boost Regression.csv"",final) # 0.22756",Yes,3,7.0 "train = pd.read_csv('../input/train.csv') test = pd.read_csv('../input/test.csv') sample = pd.read_csv('../input/test.csv')",No,5,45.0 "x_train = train[['LotArea','LotFrontage']].copy() y_train = train['SalePrice'].copy()",No,5,21.0 "x_test = test[['LotArea','LotFrontage']].copy()",No,5,21.0 y_train.head(),No,5,41.0 x_train.head(),No,5,41.0 x_train.shape,No,5,58.0 x_train.isnull().sum(),No,5,39.0 "x_train.fillna(0,inplace=True) x_test.fillna(0,inplace=True)",No,5,17.0 "from sklearn.linear_model import LinearRegression from sklearn.preprocessing import StandardScaler",No,5,22.0 "x_train_scaler = x_scaler.fit_transform(x_train) x_test_scaler = x_scaler.transform(x_test)",No,5,18.0 lr = LinearRegression(),No,5,4.0 "lr.fit(x_train_scaler,y_train)",No,5,7.0 pred = lr.predict(x_test_scaler),No,5,48.0 pd.read_csv('../input/sample_sumbission.csv').head(),No,5,45.0 "sub = pd.DataFrame(data = {'Id' : test.Id, 'SalePrice' :pred})",No,5,12.0 "sub.to_csv('submission.csv', index=False)",No,5,25.0 "import cv2 # working with, mainly resizing, images import numpy as np # dealing with arrays import os # dealing with directories from random import shuffle # mixing up or currently ordered data that might lead our network astray in training. 
train_dir = '../input/train' test_dir = '../input/test'",No,5,77.0 "X = np.array([i[0] for i in train]).reshape(-1,1,50,50) Y = [i[1] for i in train] test_x = np.array([i[0] for i in test]).reshape(-1,1,50,50) test_y = [i[1] for i in test]",No,5,21.0 "from keras.models import Sequential from keras.layers import Dense , Activation from keras.layers import Dropout from keras.layers import Flatten from keras.constraints import maxnorm from keras.optimizers import SGD from keras.layers import Convolution2D from keras.layers import Conv2D , BatchNormalization from keras.layers import MaxPooling2D from keras.utils import np_utils from keras import backend as K K.set_image_dim_ordering('th')",No,5,23.0 "# Initialising the CNN classifier = Sequential() # Step 1 - Convolution classifier.add(Convolution2D(32, 3, 3, input_shape = (1,50,50), activation = 'relu')) # Step 2 - Pooling classifier.add(MaxPooling2D(pool_size = (2, 2))) # Adding a second convolutional layer classifier.add(Convolution2D(32, 3, 3, activation = 'relu')) classifier.add(MaxPooling2D(pool_size = (2, 2))) # Adding a third convolutional layer classifier.add(Convolution2D(64, 3, 3, activation = 'relu')) classifier.add(MaxPooling2D(pool_size = (2, 2))) # Step 3 - Flattening classifier.add(Flatten()) # Step 4 - Full connection classifier.add(Dense(output_dim = 64, activation = 'relu')) classifier.add(Dropout(0.4)) classifier.add(Dense(output_dim = 2, activation = 'sigmoid')) ",No,5,4.0 "import matplotlib.pyplot as plt
%matplotlib inline

from IPython.display import set_matplotlib_formats
set_matplotlib_formats('pdf', 'png')
pd.options.display.float_format = '{:.2f}'.format
rc={'savefig.dpi': 75, 'figure.autolayout': False, 'figure.figsize': [12, 8], 'axes.labelsize': 18,\\
'axes.titlesize': 18, 'font.size': 18, 'lines.linewidth': 2.0, 'lines.markersize': 8, 'legend.fontsize': 16,\\
'xtick.labelsize': 16, 'ytick.labelsize': 16}

sns.set(style='dark',rc=rc)",Yes,4,23.0 "# Setting working directory path = '../input/' path_result = '../output/'",No,5,77.0 "train = pd.read_csv(path + \'train_data.csv\')
test = pd.read_csv(path + \'teste_data.csv\')
train = train.rename(columns={""default"": ""target"", ""ids"":""id""})
test = test.rename(columns={""ids"":""id""})",No,4,45.0 test_id = test.id,No,5,77.0 "def missing_values_table(df):
mis_val = df.isnull().sum()
mis_val_percent = 100 * df.isnull().sum() / len(df)
mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
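    # side-by-side table of absolute missing-value counts and their share of all rows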
mis_val_table_ren_columns = mis_val_table.rename(
columns = {0 : \'Missing Values\', 1 : \'% of Total Values\'})
mis_val_table_ren_columns = mis_val_table_ren_columns[
mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(
\'% of Total Values\', ascending=False).round(1)
print (""Your selected dataframe has "" + str(df.shape[1]) + "" columns.\
""
""There are "" + str(mis_val_table_ren_columns.shape[0]) +
"" columns that have missing values."")
return mis_val_table_ren_columns",No,5,53.0 missing_values_table(train),No,5,39.0 "missingValueColumns = train.columns[train.isnull().any()].tolist() df_null = train[missingValueColumns]",No,5,10.0 "msno.bar(df_null,figsize=(20,8),color=default_color,fontsize=18,labels=True)",No,5,34.0 "msno.heatmap(df_null,figsize=(20,8),cmap=colormap)",No,5,80.0 "msno.dendrogram(df_null,figsize=(20,8))",No,5,34.0 train = train.dropna(subset=['target']),No,5,17.0 "plt.figure(figsize=(15,5)) ax = sns.countplot('target',data=train,color=default_color) for p in ax.patches: ax.annotate('{:.2f}%'.format(100*p.get_height()/len(train['target'])), (p.get_x()+ 0.3, p.get_height()+0.2))",No,5,33.0 "meta_data = get_meta(train) meta_data",No,5,77.0 "meta_counts = meta_data.groupby(['role', 'level']).agg({'dtype': lambda x: x.count()}).reset_index() meta_counts",No,5,60.0 "fig,ax = plt.subplots()
fig.set_size_inches(20,5)
sns.barplot(data=meta_counts[(meta_counts.role != \'target\') & (meta_counts.role != \'id\') ],x=""level"",y=""dtype"",ax=ax,color=default_color)
ax.set(xlabel=\'Variable Type\', ylabel=\'Count\',title=""Variables Count Across Datatype"")",No,5,33.0 "col_ordinal = meta_data[(meta_data.level == 'ordinal') & (meta_data.keep)].index col_nominal = meta_data[(meta_data.level == 'nominal') & (meta_data.keep)& (meta_data.role != 'target')& (meta_data.role != 'id')].index col_interval = meta_data[(meta_data.level == 'interval') & (meta_data.keep)].index col_binary = meta_data[(meta_data.level == 'binary') & (meta_data.keep) & (meta_data.role != 'target')].index",No,5,14.0 "def count_label_encoding(train, test,col): for i in col: df1 = train[i].value_counts().reset_index(name='freq_'+ i).rename(columns={'index': 'lc_'+ i}) train = pd.merge(train,df1,left_on=i, right_on='lc_'+ i, how='left') test = pd.merge(test,df1,left_on=i, right_on='lc_'+ i, how='left') for i in list(train): if 'lc_' in i: train = train.drop(i, axis = 1) test = test.drop(i, axis = 1) return train, test",No,4,10.0 "train, test = count_label_encoding(train, test,col_nominal) train, test = count_label_encoding(train, test,col_binary)",No,5,53.0 "plt.figure(figsize=(18,16)) plt.title('Pearson correlation of continuous features', y=1.05, size=15) sns.heatmap(train[col_interval].corr(),linewidths=0.1,vmax=1.0, square=True, cmap=colormap, linecolor='white', annot=True, fmt = '.2f')",No,5,80.0 from sklearn.model_selection import train_test_split,No,5,22.0 "X = pd.concat([train[col_interval],train[col_ordinal],pd.get_dummies(train[col_binary])], axis=1) y = pd.DataFrame(train.target) X.fillna(-1, inplace=True) y.fillna(-1, inplace=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)",Yes,3,21.0 y.shape,No,5,58.0 X.shape,No,5,58.0 X.head(),No,5,41.0 "plt.figure(figsize=(18,16)) plt.title('Pearson correlation of continuous features', y=1.05, size=15) sns.heatmap(X.corr(),linewidths=0.1,vmax=1.0, square=True, cmap=colormap, linecolor='white', annot=False, fmt = '.1f')",No,5,80.0 "from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=150, max_depth=8, min_samples_leaf=30, max_features=0.2, n_jobs=-1, random_state=0)
rf.fit(X_train, y_train[\'target\'])
features = X_train.columns.values
print(""----- Training Done -----"")",No,5,7.0 "from sklearn.metrics import accuracy_score, roc_auc_score",No,5,22.0 "acc = accuracy_score(y_train, rf.predict(X_train)) auc = roc_auc_score(y_train, rf.predict(X_train)) print(""Accuracy: %.4f"" % acc) print(""AUC: %.4f"" % auc)",No,5,28.0 "acc = accuracy_score(y_test, rf.predict(X_test)) auc = roc_auc_score(y_test, rf.predict(X_test)) print(""Accuracy: %.4f"" % acc) print(""AUC: %.4f"" % auc)",No,5,49.0 "def get_feature_importance_df(feature_importances,
column_names,
top_n=25):
""""""Get feature importance data frame.

Parameters
----------
feature_importances : numpy ndarray
Feature importances computed by an ensemble
model like random forest or boosting
column_names : array-like
Names of the columns in the same order as feature
importances
top_n : integer
Number of top features

Returns
-------
df : a Pandas data frame

""""""

imp_dict = dict(zip(column_names,
feature_importances))
top_features = sorted(imp_dict,
key=imp_dict.get,
reverse=True)[0:top_n]
top_importances = [imp_dict[feature] for feature
in top_features]
df = pd.DataFrame(data={\'feature\': top_features,
\'importance\': top_importances})
return df",No,5,86.0 "feature_importance = get_feature_importance_df(rf.feature_importances_, features) feature_importance",No,5,86.0 "fig,ax = plt.subplots()
fig.set_size_inches(20,10)
sns.barplot(data=feature_importance[:10],x=""feature"",y=""importance"",ax=ax,color=default_color,)
ax.set(xlabel=\'Variable name\', ylabel=\'Importance\',title=""Variable importances"")",No,5,79.0 "from xgboost import XGBClassifier from lightgbm import LGBMClassifier from catboost import CatBoostClassifier from sklearn.linear_model import LogisticRegression from sklearn.ensemble import RandomForestClassifier",No,5,22.0 "#RandomForest params rf_params = {} rf_params['n_estimators'] = 200 rf_params['max_depth'] = 6 rf_params['min_samples_split'] = 70 rf_params['min_samples_leaf'] = 30",No,5,59.0 "rf_model = RandomForestClassifier(**rf_params, random_state=29,n_jobs = -1)",No,5,4.0 "# XGBoost params xgb_params = {} xgb_params['learning_rate'] = 0.02 xgb_params['n_estimators'] = 1000 xgb_params['max_depth'] = 4 xgb_params['subsample'] = 0.9 xgb_params['colsample_bytree'] = 0.9",No,5,59.0 "XGB_model = XGBClassifier(**rf_params, random_state=29,n_jobs=-1)",No,5,4.0 "def create_extra_features(train_ext): train_ext['null_sum'] = train_ext[train_ext==-1].count(axis=1) #train_ext['bin_sum'] = train_ext[col_binary].sum(axis=1) train_ext['ord_sum'] = train_ext[col_ordinal].sum(axis=1) train_ext['interval_median'] = train_ext[col_interval].sum(axis=1) train_ext['new_amount_borrowed_by_income'] = train_ext['amount_borrowed']/train_ext['income'] train_ext['new_amount_borrowed_by_months'] = train_ext['amount_borrowed']/train_ext['borrowed_in_months'] return train_ext",No,5,8.0 ids_targets = meta_data[meta_data['role'] != 'input'].index,No,5,77.0 train_ext.head(),No,5,41.0 test_ext.head(),No,5,41.0 "train_ext.fillna(-1, inplace = True) X_ext = pd.concat([train_ext[col_interval],train_ext[col_ordinal], pd.get_dummies(train_ext[col_binary])], axis=1) X_ext.head()",Yes,3,17.0 "X_ext = X_ext.drop(columns = ['facebook_profile_False','gender_f'], axis=1)",No,5,10.0 "test_ext = pd.concat([test_ext[col_interval],test_ext[col_ordinal], pd.get_dummies(test_ext[col_binary])], axis=1) test_ext.fillna(-1, inplace = True) #X_ext = X_ext.drop(columns = ids_targets, axis =1) y_ext = pd.DataFrame(train_ext.target) #train_lc.target_default.ravel(order='K') #pd.DataFrame(train_ext.target) y_ext=y_ext.astype('bool') y_ext = y_ext.values y_ext = y_ext.reshape(-1)",Yes,3,21.0 "cols = list(X_ext) test_ext = test_ext[cols]",No,5,77.0 X_ext.head(),No,5,41.0 "from sklearn.utils.multiclass import type_of_target type_of_target(y_ext)",No,4,70.0 X_ext.shape,No,5,58.0 test_ext.shape,No,5,58.0 "X_train, X_test, y_train, y_test = train_test_split(X_ext, y_ext, test_size=0.2, random_state=42)",No,5,13.0 from sklearn.model_selection import GridSearchCV,No,5,22.0 "tuned_parameters = [{'max_depth': [4,5,6,7,8,9,10], 'max_features': [4,5,6,7,8,9,10], 'n_estimators':[10,25,50,75]}] clf = GridSearchCV(RandomForestClassifier(random_state=29), tuned_parameters, cv=3, scoring='roc_auc') clf.fit(X_train, y_train)",No,5,6.0 "from hyperopt.pyll.base import scope from hyperopt.pyll.stochastic import sample from hyperopt import STATUS_OK, Trials, fmin, hp, tpe",No,5,22.0 "import random import itertools N_HYPEROPT_PROBES = 10 EARLY_STOPPING = 80 HOLDOUT_SEED = 123456 HOLDOUT_SIZE = 0.10 HYPEROPT_ALGO = tpe.suggest # tpe.suggest OR hyperopt.rand.suggest DATASET = 'clean' # 'raw' | 'clean' | 'extended' SEED0 = random.randint(1,1000000000) NB_CV_FOLDS = 5",No,5,59.0 "obj_call_count = 0 cur_best_score = 0",No,5,77.0 "space_RF ={
\'n_estimators\' : hp.choice(\'n_estimators\', np.arange(10, 200, dtype=int)),
\'max_depth\' : hp.choice(""max_depth"", np.arange(3, 15, dtype=int)),
\'min_samples_split\' : hp.choice(""min_samples_split"", np.arange(20, 100, dtype=int)),
\'min_samples_leaf\' : hp.choice(""min_samples_leaf"", np.arange(10, 100, dtype=int)),
\'criterion\' : hp.choice(\'criterion\', [""gini"", ""entropy""]),
\'class_weight\' : hp.choice(\'class_weight\', [\'balanced_subsample\', None]),
\'n_jobs\' : -1,
\'oob_score\' : True,
\'random_state\' : hp.randint(\'random_state\',2000000)
}
#{\'class_weight\': 1, \'criterion\': 1, \'max_depth\': 9, \'min_samples_leaf\': 74, \'min_samples_split\': 12, \'n_estimators\': 134, \'random_state\': 1433254}
#Params: class_weight=balanced_subsample criterion=entropy max_depth=11 min_samples_leaf=2 min_samples_split=29 n_estimators=89 n_jobs=-1 oob_score=True
#Params: class_weight=balanced_subsample criterion=entropy max_depth=10 min_samples_leaf=2 min_samples_split=17 n_estimators=38 n_jobs=-1 oob_score=True",No,5,5.0 "space_XGB ={
\'max_depth\' : hp.choice(""max_depth"", np.arange(5, 15,dtype=int)),
\'learning_rate\' : hp.loguniform(\'learning_rate\', -4.9, -3.0),
\'n_estimators\' : hp.choice(\'n_estimators\', np.arange(10, 100,dtype=int)),
\'objective\' : \'binary:logistic\',
\'booster\' : \'gbtree\',
\'reg_alpha\' : hp.uniform(\'reg_alpha\', 1e-5, 1e-1),
\'reg_lambda\' : hp.uniform(\'reg_lambda\', 1e-5, 1e-1),
\'colsample_bytree\' : hp.uniform(\'colsample_bytree\', 0.5, 0.8),
\'min_child_weight\' : hp.uniform(\'min_child_weight\', 0.5, 0.8),
\'random_state\' : hp.randint(\'random_state\',2000000)
}",No,5,5.0 "train_stack = train_backup.copy() test_stack = test_backup.copy()",No,5,12.0 train_stack.shape,No,5,58.0 test_stack.shape,No,5,58.0 "meta_data = get_meta(train_stack) col_ordinal = meta_data[(meta_data.level == 'ordinal') & (meta_data.keep)& (meta_data.role != 'target')].index col_nominal = meta_data[(meta_data.level == 'nominal') & (meta_data.keep)& (meta_data.role != 'target')].index col_interval = meta_data[(meta_data.level == 'interval') & (meta_data.keep)& (meta_data.role != 'target')].index col_binary = meta_data[(meta_data.level == 'binary') & (meta_data.keep) & (meta_data.role != 'target')].index #meta_data",No,5,14.0 "train_stack = train_stack.replace(-1, np.NaN) d_median = train_stack.median(axis=0) d_mean = train_stack.mean(axis=0) train_stack = train_stack.fillna(-1)",No,5,17.0 from sklearn import preprocessing,No,5,22.0 " train_stack = create_extra_features(train_stack) test_stack = create_extra_features(test_stack) train_stack['bin_sum'] = train_stack[col_binary].sum(axis=1) test_stack['bin_sum'] = test_stack[col_binary].sum(axis=1)",No,5,8.0 "col = [c for c in train_stack.columns if c not in ['id','target']] col = [c for c in col if not c.startswith('ps_calc_')] ## Droping ps_cal_ vars",No,5,77.0 target_stack.shape,No,5,58.0 "score = classifier.evaluate(test_x, test_y, verbose=0) print('valid loss:', score[0]) print('valid accuracy:', score[1])",No,5,49.0 "import pandas as pd aa = pd.read_csv('submission_file.csv') aa ",Yes,4,45.0 "import numpy as np import pandas as pd from fastai.conv_learner import * import os # Input data files are available in the ""../input/"" directory. # Any results you write to the current directory are saved as output. PATH = ""data/dogscats/""",No,5,77.0 "# Image size, batch size and pretrained model architecture sz=224 bs=20 arch=resnet50",No,5,77.0 "X_stack = pd.concat([train_stack[col_interval],train_stack[col_ordinal], pd.get_dummies(train_stack[col_binary])], axis=1) test_stack_val = pd.concat([test_stack[col_interval],test_stack[col_ordinal], pd.get_dummies(test_stack[col_binary])], axis=1) y_stack = target_stack",Yes,4,21.0 X_stack.shape,No,5,58.0 test_stack_val.shape,No,5,58.0 " X_stack = X_stack.drop(columns=['gender_-1','facebook_profile_-1'], axis = 1) ",No,5,10.0 "#RandomForest params
rf_params = {}
rf_params[\'n_estimators\'] = 80
rf_params[\'max_depth\'] = 12
rf_params[\'min_samples_split\'] = 50
rf_params[\'min_samples_leaf\'] = 23
#rf_params[\'class_weight\'] = ""balanced_subsample""# ""balanced"" # ""balanced_subsample""
#rf_params[\'criterion\'] = 1
#{\'class_weight\': 1, \'criterion\': 1, \'max_depth\': 10, \'min_samples_leaf\': 23, \'min_samples_split\': 88, \'n_estimators\': 66, \'random_state\': 584867}
#{\'class_weight\': 0, \'criterion\': 1, \'max_depth\': 3, \'min_samples_leaf\': 1, \'min_samples_split\': 15, \'n_estimators\': 31}
#{\'class_weight\': 1, \'criterion\': 1, \'max_depth\': 7, \'min_samples_leaf\': 1, \'min_samples_split\': 25, \'n_estimators\': 52}",No,5,59.0 "# XGBoost params xgb_params = {} xgb_params['learning_rate'] =0.03660642032718193 xgb_params['n_estimators'] = 70 xgb_params['max_depth'] = 7 xgb_params['reg_alpha'] = 0.1 xgb_params['reg_lambda'] = 0.1 xgb_params['colsample_bytree'] = 0.6162725690461764 xgb_params['min_child_weight'] = 0.751826989118936 #{'colsample_bytree': 0.6162725690461764, 'learning_rate': 0.07660642032718193, 'max_depth': 1, 'min_child_weight': 0.751826989118936, 'n_estimators': 51, 'random_state': 2943, 'reg_alpha': 8.447744027604217e-05, 'reg_lambda': 2.506380824011793e-05} #{'colsample_bytree': 0.6669680642534331, 'learning_rate': 0.0027697150000431693, 'max_depth': 2, 'min_child_weight': 0.7842089630474731, 'n_estimators': 58, 'random_state': 194789, 'reg_alpha': 6.334122926125054e-05, 'reg_lambda': 7.725227814541321e-05} #{'colsample_bytree': 0.7185209051997172, 'learning_rate': 0.09634564047154007, 'max_depth': 1, 'min_child_weight': 0.7765683660381831, 'n_estimators': 60, 'random_state': 1791482, 'reg_alpha': 1.5998181299665275e-05, 'reg_lambda': 9.446368653609355e-05} #{'colsample_bytree': 0.785981949747911, 'learning_rate': 0.07697973917507268, 'max_depth': 0, 'min_child_weight': 0.7528834859046539, 'n_estimators': 48, 'random_state': 1038594, 'reg_alpha': 9.730513129698628e-05, 'reg_lambda': 9.804649087783435e-05}",No,5,59.0 "rf_model = RandomForestClassifier(**rf_params, random_state=584867)",No,5,4.0 "xgb_model = XGBClassifier(**xgb_params, random_state=2943)",No,5,4.0 log_model = LogisticRegression(random_state=29),No,5,4.0 "stack = Ensemble(n_splits=3, stacker = log_model, base_models = (rf_model, xgb_model))",No,5,4.0 "X_stack.fillna(-1, inplace = True) test_stack_val.fillna(-1,inplace=True) y_pred = stack.fit_predict(X_stack, target_stack, test_stack_val)",Yes,4,17.0 "sub = pd.DataFrame() sub['ids'] = test_id sub['prob'] = y_pred sub.to_csv('stacked_main.csv', index=False)",No,5,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here\'s several helpful packages to load in

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # plotting graphs
import sklearn
%matplotlib inline
# Input data files are available in the ""../input/"" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
os.listdir(""../input"")
# Any results you write to the current directory are saved as output.",No,5,88.0 category_id_df,No,5,53.0 "# The sorted function Converts dictionary items into a (sorted) list. # In subsequent steps - We will use this list to iterate over the categories sorted(category_to_id.items())",No,5,9.0 "# Store the entries into the results dataframe and name its columns BBC Ncv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])",No,5,12.0 "# Create submission file: 2 colmuns with header id, label submission = pd.DataFrame({'id':os.listdir(f'{PATH}test1'), 'label':label_probs}) submission['id'] = submission['id'].map(lambda x: x.split('.')[0]) submission['id'] = submission['id'].astype(int) submission = submission.sort_values('id') submission.to_csv('../working/submission.csv', index=False)",Yes,4,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here\'s several helpful packages to load in

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # plotting graphs
import sklearn
%matplotlib inline
# Input data files are available in the ""../input/"" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir(""../input/ai-academy-intermediate-class-competition-1""))

# Any results you write to the current directory are saved as output.",No,4,88.0 "from sklearn.linear_model import LogisticRegression from sklearn.ensemble import RandomForestClassifier from sklearn.naive_bayes import MultinomialNB from sklearn.model_selection import cross_val_score models = [ RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0), MultinomialNB(), LogisticRegression(random_state=0), ] ",No,5,4.0 "# model.coef_ contains the importance of each feature for each category model.coef_",No,5,79.0 test_df,No,5,41.0 test_df.head(),No,5,41.0 test_df.Text.tolist(),No,5,16.0 "#translating text column into a list test_features = tfidf.transform(test_df.Text.tolist()) Y_pred = model.predict(test_features) Y_pred",Yes,3,8.0 submission,No,5,41.0 "# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here\'s several helpful packages to load in

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib

# Input data files are available in the ""../input/"" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
data = pd.read_csv(""../input/ai-academy-intermediate-class-competition-1/BBC News Train.csv"")
data = data[[""Text"", ""Category""]]
data
# Any results you write to the current directory are saved as output.",No,4,45.0 "vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=5, norm=\'l2\', encoding=\'latin-1\', ngram_range=(1, 2), stop_words=\'english\')
X = vectorizer.fit_transform(data[""Text""])
print(len(vectorizer.get_feature_names()))
print(X.shape)",Yes,3,8.0 "tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english') features = tfidf.fit_transform(data.Text).toarray() print(features) labels = data.category_id print(labels) features.shape",Yes,3,8.0 "from sklearn.linear_model import LogisticRegression from sklearn.ensemble import RandomForestClassifier from sklearn.naive_bayes import MultinomialNB from sklearn.model_selection import cross_val_score models = [ RandomForestClassifier(n_estimators=500, max_depth=4), MultinomialNB(), LogisticRegression(random_state=4), ] CV = 5 cv_df = pd.DataFrame(index=range(CV * len(models))) print(cv_df) entries = [] for model in models: model_name = model.__class__.__name__ accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV) print(accuracies) for fold_idx, accuracy in enumerate(accuracies): entries.append((model_name, fold_idx, accuracy)) print(entries) cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy']) print(cv_df)",Yes,3,3.0 "model = models[2] model.fit(features, labels) model.coef_",Yes,3,7.0 "test_data = pd.read_csv(""../input/bbc-test-3/BBC News Test.csv"") test_data",No,5,45.0 "submission = pd.DataFrame({ ""ArticleId"": test_data[""ArticleId""], ""Category"": submission }) submission",No,5,55.0 "submission.to_csv('submission.csv', index=False)",No,5,25.0 "import numpy as np import pandas as pd import sklearn import matplotlib.pyplot as plt",No,5,22.0 "train = pd.read_csv(""../input/train.csv"")",No,5,45.0 "Xtrain = train Xtrain = Xtrain.drop(columns=[""median_house_value""]) Xtrain.head()",Yes,4,10.0 "from sklearn import linear_model from sklearn.neighbors import KNeighborsRegressor",No,5,22.0 "ridge = linear_model.Ridge(alpha = 0.5) ridge.fit(Xtrain,Ytrain) ridge.coef_",Yes,3,7.0 "lasso = linear_model.Lasso(alpha = 0.1) lasso.fit(Xtrain,Ytrain)",Yes,4,7.0 "KNNRegression = KNeighborsRegressor(n_neighbors=52) KNNRegression.fit(Xtrain, Ytrain) ",Yes,4,7.0 "test = pd.read_csv(""../input/test.csv"") test.head()",Yes,4,45.0 "pred.to_csv(""prediction.csv"", index=False)",No,5,25.0 "# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here\'s several helpful packages to load in

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the ""../input/"" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir(""../input""))

FOLDER = ""../input/""
# Any results you write to the current directory are saved as output.",Yes,3,45.0 "import matplotlib.pyplot as plt import matplotlib.cm as cm from sklearn import linear_model, ensemble, neighbors import numpy as np import sympy as sp import pandas as pd from pylab import rcParams from sklearn.metrics import r2_score from sklearn.model_selection import cross_val_score from sklearn.feature_selection import f_regression plt.rcParams['figure.figsize'] = [20, 10] %matplotlib inline",Yes,4,22.0 dataset = pd.read_csv(FOLDER+'train.csv'),No,5,45.0 dataset.head(),No,5,41.0 dataset.mean(),No,5,40.0 dataset.std()/dataset.mean(),No,5,40.0 "def add_features(dset): mean_houses = pd.Series(dset['households']/dset['population'], name = 'mean_households') rooms_ratio = pd.Series(dset['total_rooms']/dset['total_bedrooms'], name = 'ratio' ) return pd.concat([mean_houses, rooms_ratio, dset], axis = 1)",No,5,8.0 "test = pd.read_csv(FOLDER+'test.csv') test.head()",Yes,4,45.0 "import numpy as np import pandas as pd import sklearn from matplotlib import pyplot as plt from scipy import stats as st import os import matplotlib.colors as mcolors",No,5,22.0 "from sklearn.pipeline import make_pipeline from sklearn.ensemble import BaggingRegressor from sklearn.ensemble import RandomForestRegressor from sklearn.neural_network import MLPRegressor from sklearn.ensemble import GradientBoostingRegressor from sklearn.ensemble import AdaBoostRegressor from sklearn.tree import DecisionTreeRegressor from sklearn.model_selection import cross_val_score",No,5,22.0 "trainfilepath = ""../input/atividade-3-pmr3508/train.csv"" testfilepath = ""../input/atividade-3-pmr3508/test.csv""",No,5,77.0 "trainHouses = pd.read_csv(trainfilepath, sep=r'\\s*,\\s*', engine='python', na_values='?')
testHouses = pd.read_csv(testfilepath, sep=r'\\s*,\\s*', engine='python', na_values='?')",No,5,45.0 "states = pd.read_csv(""../input/averagestatecoordinates/states.csv"", sep=r\'\\s*,\\s*\', engine=\'python\', na_values=\'?\')
states.head()",Yes,4,45.0 "from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=3, p=2)

#knn.fit(states.drop(""state"", axis=""columns""), states[\'state\'])
knn.fit(states[[""latitude"", ""longitude""]], states[\'state\'])


#trainHouses[""state""] = knn.predict(trainHouses.drop(trainHouses.columns.drop([""latitude"", ""longitude""]), axis=""columns""))
#prediction = knn.predict(trainHouses.drop(trainHouses.columns.drop([""latitude"", ""longitude""]), axis=""columns""))
prediction = knn.predict(trainHouses[[""latitude"", ""longitude""]])

#print(prediction.value_counts())
print(np.unique(prediction,return_counts=True))",Yes,3,7.0 "apple = (37.33182, -122.03118) SF = (37.783333, -122.416667) #trainHouses[""distance_to_apple""] = np.sqrt((trainHouses[""latitude""] - miami[0])**2 + (trainHouses[""longitude""] - miami[1])**2) #d_to_apple = np.sqrt((trainHouses[""latitude""] - apple[0])**2 + (trainHouses[""longitude""] - apple[1])**2) d_to_apple = getDistanceFromLatLng(trainHouses[""latitude""], trainHouses[""longitude""], apple[0], apple[1]) #d_to_SF = np.sqrt((trainHouses[""latitude""] - SF[0])**2 + (trainHouses[""longitude""] - SF[1])**2) d_to_SF = getDistanceFromLatLng(trainHouses[""latitude""], trainHouses[""longitude""], SF[0], SF[1]) plt.hist(d_to_apple, bins=100) plt.show() plt.hist2d(d_to_apple, trainHouses[""median_house_value""], bins=100, norm=mcolors.PowerNorm(0.15)) plt.show() plt.hist(d_to_SF, bins=100) plt.show() plt.hist2d(d_to_SF, trainHouses[""median_house_value""], bins=100, norm=mcolors.PowerNorm(0.15)) plt.show()",Yes,4,33.0 "LA = (35.0569, -118.25) Beverly_Hills = (34.073056, -118.399444) #d_to_LA = np.sqrt((trainHouses[""latitude""] - LA[0])**2 + (trainHouses[""longitude""] - LA[1])**2) d_to_LA = getDistanceFromLatLng(trainHouses[""latitude""], trainHouses[""longitude""], LA[0], LA[1]) #d_to_BH = np.sqrt((trainHouses[""latitude""] - Beverly_Hills[0])**2 + (trainHouses[""longitude""] - Beverly_Hills[1])**2) d_to_BH = getDistanceFromLatLng(trainHouses[""latitude""], trainHouses[""longitude""], Beverly_Hills[0], Beverly_Hills[1]) plt.hist(d_to_LA, bins=100) plt.show() plt.hist2d(d_to_LA, trainHouses[""median_house_value""], bins=100, norm=mcolors.PowerNorm(0.15)) plt.show() plt.hist(d_to_BH, bins=100) plt.show() plt.hist2d(d_to_BH, trainHouses[""median_house_value""], bins=100, norm=mcolors.PowerNorm(0.15)) plt.show()",Yes,4,33.0 "trainHouses[""distance_to_SF""] = d_to_SF #testHouses[""distance_to_SF""] = np.sqrt((testHouses[""latitude""] - SF[0])**2 + (testHouses[""longitude""] - SF[1])**2) testHouses[""distance_to_SF""] = getDistanceFromLatLng(testHouses[""latitude""], testHouses[""longitude""], SF[0], SF[1]) trainHouses[""distance_to_LA""] = d_to_LA #testHouses[""distance_to_LA""] = np.sqrt((testHouses[""latitude""] - LA[0])**2 + (testHouses[""longitude""] - LA[1])**2) testHouses[""distance_to_LA""] = getDistanceFromLatLng(testHouses[""latitude""], testHouses[""longitude""], LA[0], LA[1]) trainHouses[""distance_to_state_center""] = d_to_center #testHouses[""distance_to_state_center""] = np.sqrt((testHouses[""latitude""] - lat)**2 + (testHouses[""longitude""] - long)**2) testHouses[""distance_to_state_center""] = getDistanceFromLatLng(testHouses[""latitude""], testHouses[""longitude""], lat, long) trainHouses[""distance_to_beverly_hills""] = d_to_BH testHouses[""distance_to_beverly_hills""] = getDistanceFromLatLng(testHouses[""latitude""], testHouses[""longitude""], Beverly_Hills[0], Beverly_Hills[1])",No,5,8.0 trainHouses.shape,No,5,58.0 trainHouses.head(),No,5,41.0 "plt.hist(trainHouses[""median_house_value""], bins=100) plt.show()",No,5,33.0 "XtrainHouses = trainHouses.drop([""Id"", ""median_house_value""], axis=""columns"") XtrainHousesB = trainHouses.drop([""Id"", ""median_house_value"", ""median_age"", ""total_rooms"", ""total_bedrooms"", ""population"", ""households""], axis=""columns"") YtrainHouses = trainHouses[""median_house_value""] XtestHouses = testHouses.drop(""Id"", axis=""columns"") XtestHousesB = testHouses.drop([""Id"", ""median_age"", ""total_rooms"", ""total_bedrooms"", ""population"", ""households""], 
axis=""columns"")",Yes,4,10.0 "#a = trainHouses[trainHouses[""median_house_value""].transform(lambda x: x<=500000)] #a = trainHouses[trainHouses[""median_house_value""] <=500000] plt.hist(trainHouses[trainHouses[""median_house_value""] <=500000][""median_house_value""], bins=100) plt.show() culledTrainHouses = trainHouses[trainHouses[""median_house_value""] <=500000] XculledTrainHouses = culledTrainHouses.drop([""Id"", ""median_house_value""], axis=""columns"") YculledTrainHouses = culledTrainHouses.median_house_value ",Yes,3,33.0 "import pandas as pd import sklearn import os import numpy as np import matplotlib.pyplot as plt from sklearn.neighbors import KNeighborsRegressor from sklearn.model_selection import cross_val_score from sklearn.metrics import mean_squared_log_error from sklearn import linear_model from sklearn.ensemble import RandomForestRegressor from sklearn.tree import DecisionTreeRegressor from sklearn.ensemble import AdaBoostRegressor ",Yes,4,22.0 "train_data = pd.read_csv('../input/train.csv', engine='python') test_data = pd.read_csv('../input/test.csv', engine='python')",No,5,45.0 "train_data.head() ",No,5,41.0 train_data.shape,No,5,58.0 train_data.info(),No,5,40.0 "Xtrain = train_data Xtrain = Xtrain.drop('Id', axis=1) Xtrain = Xtrain.drop('median_house_value', axis=1) Ytrain = train_data['median_house_value']",Yes,4,21.0 "knn = KNeighborsRegressor(n_neighbors=10) knn.fit(Xtrain, Ytrain) scores = cross_val_score(knn, Xtrain, Ytrain, cv=10) Ypred = knn.predict(Xtrain) scores.mean()",Yes,3,7.0 "rmsle(Ytrain, Ypred)",No,5,28.0 "reg = linear_model.Lasso(alpha = 0.1) reg.fit(Xtrain, Ytrain) scores = cross_val_score(reg, Xtrain, Ytrain, cv=10) Ypred=reg.predict(Xtrain) scores.mean()",Yes,3,7.0 "rid = linear_model.Ridge(alpha = 0.5) rid.fit(Xtrain, Ytrain) scores = cross_val_score(rid, Xtrain, Ytrain, cv=10) Ypred = rid.predict(Xtrain) scores.mean()",Yes,3,7.0 "bay = linear_model.BayesianRidge() bay.fit(Xtrain, Ytrain) scores = cross_val_score(bay, Xtrain, Ytrain, cv=10) Ypred = bay.predict(Xtrain) scores.mean()",Yes,3,7.0 "lars = linear_model.LassoLars(alpha = 0.1) lars.fit(Xtrain, Ytrain) scores = cross_val_score(lars, Xtrain, Ytrain, cv=10) Ypred = lars.predict(Xtrain) scores.mean()",Yes,3,7.0 "california_sea=[(41.990352, -124.216535),(41.936725, -124.199048),(41.862157, -124.220161),(41.758672, -124.240793),(41.730317, -124.162807),(41.672629, -124.139878),(41.722746, -124.151351),(41.671813, -124.136762),(41.618963, -124.109252),(41.470737, -124.072740),(41.383226, -124.066948),(41.308172, -124.094492),(41.212278, -124.121904), (41.137176, -124.165918),(41.062165, -124.165618),(41.020596, -124.115740),(40.928851, -124.143028),(40.858028, -124.126245),(40.812048, -124.181163),(40.728511, -124.235831),(40.649059, -124.301387),(40.586325, -124.344954),(40.511043, -124.388365),(40.440002, -124.409806),(40.395399, -124.383960),(40.322914, -124.349643),(40.241803, -124.337706),(40.186635, -124.253402),(40.122885, -124.169203),(40.067673, -124.068499),(40.008009, -124.029231), (39.922813, -123.945453),(39.837566, -123.873007),(39.735216, -123.828474),(39.654186, -123.789622),(39.564619, -123.761930),(39.399528, -123.821626),(39.201588, -123.770073),(39.076989, -123.691566),(38.960637, -123.724138),(38.879044, -123.662811),(38.754580, -123.507611),(38.634199, -123.386034),(38.496411, -123.193367),(38.336876, -123.061865),(38.259117, -122.974368), (38.151338, -122.952917),(38.060918, -122.980669),(37.996318, -123.002792), (38.026254, -122.926130),(38.004306, 
-122.827828),(37.931906, -122.744687),(37.902923, -122.652017),(37.872444, -122.594173),(37.880984, -122.392446), (37.815555, -122.367515),(37.628327, -122.331577),(37.542968, -122.455670), (37.370235, -122.414093),(37.290236, -122.415691),(37.167091, -122.356855), (37.088046, -122.276348),(36.987005, -122.157357),(36.951905, -122.049790), (36.969554, -121.914753),(36.925477, -121.862435),(36.824092, -121.802024), (36.620740, -121.851334),(36.480625, -121.934216),(36.282719, -121.866908), (36.162592, -121.678018),(35.990860, -121.498031),(35.827849, -121.382193), (35.671399, -121.272296),(35.608589, -121.143265),(35.453082, -120.919491), (35.297750, -120.877400),(35.189759, -120.819107),(35.180890, -120.736397), (35.097645, -120.628863),(34.932680, -120.660285),(34.842040, -120.610177), (34.742216, -120.618143),(34.583391, -120.639685),(34.528043, -120.518413), (34.457687, -120.472919),(34.458791, -120.347644),(34.469789, -120.138306), (34.422313, -119.903627),(34.399196, -119.699791),(34.408922, -119.552255),(34.335795, -119.408499),(34.288024, -119.329889),(34.199208, -119.247261),(34.115993, -119.153777),(34.041474, -118.899965),(34.035682, -118.855901),(34.018486, -118.822894),(34.003602, -118.805037),(34.016106, -118.785710), (34.029683, -118.744327),(34.037409, -118.667109),(34.036912, -118.580005), (34.009365, -118.502919),(33.984242, -118.472597),(33.960222, -118.454035),(33.867022, -118.402873),(33.810913, -118.390523),(33.770287, -118.420867),(33.716625, -118.060214),(33.606537, -117.889392),(33.385674, -117.578771),(33.270497, -117.443285),(33.127431, -117.326314),(33.053581, -117.291643),(32.831417, -117.277875),(32.683026, -117.189643),(32.536805, -117.122224)]",No,5,77.0 "train_data3 = train_data2
train_data3[""rooms_per_household""] = train_data3[""total_rooms""]/train_data3[""households""]
train_data3[""bedrooms_per_room""] = train_data3[""total_bedrooms""]/train_data3[""total_rooms""]
train_data3[""population_per_household""] = train_data3[""population""]/train_data3[""households""]
train_data3[""income_per_person""] = train_data3[""median_income""]/train_data3[""population_per_household""]
train_data3[\'mean_rooms\'] = train_data3[\'total_rooms\']/train_data3[\'households\']
train_data3[\'rooms_per_person\'] = train_data3[\'total_rooms\']/train_data3[\'population\']
train_data3[\'mean_bedrooms\'] = train_data3[\'total_bedrooms\']/train_data3[\'households\']
train_data3[\'bedrooms_per_person\'] = train_data3[\'total_bedrooms\']/train_data3[\'population\']
train_data3[\'persons_per_household\'] = train_data3[\'population\']/train_data3[\'households\']
train_data3[\'total_income\'] = train_data3[\'median_income\']*train_data3[\'households\']",No,5,8.0 "d = 8 n = 100 boost = AdaBoostRegressor(DecisionTreeRegressor(max_depth=d), n_estimators=n) boost.fit(Xtrain3, Ytrain) Ypred = boost.predict(Xtrain3) boost_rmsle = rmsle(Ytrain, Ypred) print('error =', boost_rmsle)",Yes,3,7.0 "from sklearn.model_selection import train_test_split X_train, X_validation, y_train, y_validation = train_test_split(Xtrain3, Ytrain, train_size=0.7) from catboost import CatBoostRegressor model=CatBoostRegressor(iterations=200, depth=6, learning_rate=0.2, loss_function='RMSE') model.fit(Xtrain3, Ytrain,eval_set=(X_validation, y_validation),plot=True) Ypred = model.predict(Xtrain3) model_rmsle = rmsle(Ytrain, Ypred) print('error =', model_rmsle)",Yes,2,7.0 "Xtest2 = test_data2.drop('Id', axis=1)",No,5,10.0 "forest = RandomForestRegressor(max_depth=21, random_state=0, n_estimators=1000) forest.fit(Xtrain3, Ytrain) Ypred = forest.predict(Xtrain3) forest_rmsle = rmsle(Ytrain, Ypred) print('log error =', forest_rmsle) prediction = forest.predict(Xtest2)",Yes,3,7.0 "import numpy as np import pandas as pd import sklearn import matplotlib.pyplot as plt import os print(os.listdir(""../input"")) ",No,5,88.0 "train = pd.read_csv(""../input/train.csv"") test = pd.read_csv(""../input/test.csv"") Xtrain = train.drop(columns=[""Id"",""median_house_value""]) Ytrain = train[""median_house_value""] ",Yes,3,45.0 Xtrain.describe(),No,5,40.0 YPredict.describe(),No,5,40.0 "pd.DataFrame({""Id"":ID_list,""median_house_value"":YPredict}).to_csv(""pred_R.csv"",index=False)",No,5,25.0 "import numpy as np import pandas as pd import matplotlib as mpl import matplotlib.pyplot as plt import warnings warnings.filterwarnings(""ignore"")",No,5,23.0 "data = pd.read_csv(""../input/atividade-3-pmr3508/train.csv"") data.head()",Yes,4,45.0 data.shape,No,5,58.0 "newData = data.drop([""Id"",""latitude"",""longitude""],axis=1)",No,5,10.0 "newData[""avg_rooms""] = newData.total_rooms/newData.households newData[""avg_bedrooms""] = newData.total_bedrooms/newData.households newData[""avg_inhabitants""] = newData.population/newData.households",No,5,8.0 "from sklearn.neighbors import KNeighborsRegressor from sklearn.model_selection import GridSearchCV",No,5,22.0 "knn = KNeighborsRegressor() knn.get_params()",Yes,4,4.0 "param_grid = {""n_neighbors"":[i for i in range(1,31)],""weights"":[""uniform"",""distance""],""p"":[1,2]} grid = GridSearchCV(knn,param_grid,cv=10)",Yes,4,5.0 "grid.fit(xTrain,yTrain) print(grid.best_estimator_) print(grid.best_score_)",No,5,6.0 "ridge = Ridge() ridge.get_params()",Yes,4,4.0 "param_grid2 = {""alpha"":np.linspace(0.5,10.5,101).tolist()} grid2 = GridSearchCV(ridge,param_grid2,cv=10)",Yes,4,5.0 "grid2.fit(xTrain,yTrain) print(grid2.best_estimator_) print(grid2.best_score_)",No,5,6.0 from sklearn.linear_model import Lasso,No,5,22.0 "lasso = Lasso() lasso.get_params()",Yes,4,4.0 "param_grid3 = {""alpha"":np.linspace(0.5,5.5,51).tolist(),""normalize"":[True,False]} grid3 = GridSearchCV(lasso,param_grid3,cv=10)",Yes,4,5.0 "grid3.fit(xTrain,yTrain) print(grid3.best_estimator_) print(grid3.best_score_)",No,5,6.0 "testRaw = pd.read_csv(""../input/atividade-3-pmr3508/test.csv"") ID_list = testRaw.Id.tolist() testRaw[""avg_rooms""] = testRaw.total_rooms/testRaw.households testRaw[""avg_bedrooms""] = testRaw.total_bedrooms/testRaw.households testRaw[""avg_inhabitants""] = testRaw.population/testRaw.households testData = testRaw.drop([""Id"",""latitude"",""longitude""],axis=1) 
testData.head()",Yes,3,8.0 "knn.fit(xTrain,yTrain) pred_knn = knn.predict(testData).tolist()",Yes,4,7.0 "pd.DataFrame({""Id"":ID_list,""median_house_value"":pred_knn}).to_csv(""pred_knn.csv"",index=False)",No,5,25.0 "ridge.fit(xTrain,yTrain) pred_ridge = ridge.predict(testData).tolist()",Yes,4,7.0 "pd.DataFrame({""Id"":ID_list,""median_house_value"":pred_ridge}).to_csv(""pred_ridge.csv"",index=False)",No,5,25.0 "lasso.fit(xTrain,yTrain) pred_lasso = lasso.predict(testData).tolist()",Yes,4,7.0 "pd.DataFrame({""Id"":ID_list,""median_house_value"":pred_lasso}).to_csv(""pred_lasso.csv"",index=False)",No,5,25.0 "import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns from geopy.distance import distance from geopy.distance import vincenty from sklearn.metrics import make_scorer from sklearn.model_selection import cross_validate from sklearn.preprocessing import MinMaxScaler from sklearn.model_selection import train_test_split from sklearn.linear_model import LinearRegression from sklearn.neighbors import KNeighborsClassifier from sklearn.model_selection import GridSearchCV from sklearn.linear_model import Lasso from sklearn.ensemble import RandomForestClassifier",No,5,22.0 "Train = pd.read_csv(""../input/californianhouses/train.csv"",
sep=r\'\\s*,\\s*\',
engine=\'python\',
na_values="""")
Test = pd.read_csv(""../input/californianhouses/test.csv"",
sep=r\'\\s*,\\s*\',
engine=\'python\',
na_values="""")",No,5,45.0 Train.head(),No,5,41.0 # Id no um critrio para estimar a varivel desejada
Train = Train.drop(columns=['Id']),No,5,10.0 plt.title('Correlation matrix')
sns.heatmap(mcorr),No,5,80.0 "scaler = MinMaxScaler() selected_columns = ['median_income', 'total_rooms','population','median_age'] SC = scaler.fit_transform(Train[selected_columns]) x_train, x_test, y_train, y_test = train_test_split(SC, Train['median_house_value'], test_size=0.20)",Yes,4,18.0 "def rmsle(y_test, y_pred): return np.sqrt(np.mean((np.log(y_pred+1) - np.log(y_test+1))**2)) reg = LinearRegression() scorer = make_scorer(rmsle, greater_is_better=False) reg.fit(x_train, y_train) y_pred = reg.predict(x_test) print(""RMSLE: "" + str(rmsle(y_pred, y_test)))",Yes,3,7.0 "param_grid = dict(n_neighbors=list(range(1,15))) neigh = KNeighborsClassifier() grid_obj = GridSearchCV(neigh, param_grid, scoring=scorer, cv=5) grid_obj.fit(x_train, y_train) grid_obj.best_params_",Yes,3,7.0 "neigh = KNeighborsClassifier(n_neighbors=1) neigh.fit(x_train, y_train) y_pred = neigh.predict(x_test) print(""RMSLE: "" + str(rmsle(y_pred, y_test)))",Yes,3,7.0 "las = Lasso() param_grid = dict(alpha=np.divide(list(range(1,100)),100)) grid_obj = GridSearchCV(las, param_grid, scoring=scorer, cv=5) grid_obj.fit(x_train, y_train) grid_obj.best_params_",Yes,2,7.0 "las = Lasso(alpha=0.21) las.fit(x_train, y_train) y_pred = las.predict(x_test) print(""RMSLE: "" + str(rmsle(y_pred, y_test)))",Yes,3,7.0 "rfc = RandomForestClassifier(n_estimators=50, max_depth=35, random_state=0) rfc.fit(x_train, y_train) y_pred = rfc.predict(x_test) print(""RMSLE: "" + str(rmsle(y_pred, y_test))) ",Yes,3,7.0 "dfTest = Test.drop(['longitude', 'latitude', 'households', 'total_bedrooms'], axis=1) dfTest.head()",Yes,4,10.0 dfTest.shape,No,5,58.0 "selected_model = rfc x_val_test = scaler.transform(dfTest[selected_columns]) y_val_test = selected_model.predict(x_val_test) dfSave = pd.DataFrame(data={""Id"" : dfTest[""Id""], ""median_house_value"" : y_val_test}) pd.DataFrame(dfSave[[""Id"", ""median_house_value""]], columns = [""Id"", ""median_house_value""]).to_csv(""Output.csv"", index=False) ",Yes,3,25.0 # Importao das bibliotecas
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
import math
import seaborn as sns
,No,5,23.0 "#Lendo a base de treino
traindata = pd.read_csv(""../input/california-houses/train.csv"",
sep=r\'\\s*,\\s*\',
engine=\'python\',
na_values=""?"")",No,5,45.0 "#Modificando a visalizao da base de treino
traindata.iloc[0:20,:]",No,5,14.0 traindata = traindata.drop(columns = ['Id']),No,5,10.0 traindata.info(),No,5,40.0 "# Grficos de barra das variveis
traindata.hist(bins=50, figsize=(20,20))
plt.show()",No,5,33.0 "plt.figure(figsize=(6,6))
plt.title('Correlation matrix')
sns.heatmap(traindata.corr(), annot=True, linewidths=0.1)",No,5,80.0 "knn_trainX = traindata[[""longitude"",""total_rooms"",""total_bedrooms"",""population"",""households"",""median_income""]] ",No,5,12.0 "from sklearn.neighbors import KNeighborsRegressor neighbor = KNeighborsRegressor(n_neighbors=2) neighbor.fit(knn_trainX,trainY) knn_predict = neighbor.predict(knn_trainX) df_knn = pd.DataFrame({'Y_real':trainY[:],'Y_pred':knn_predict[:]}) print(rmsle(df_knn.Y_real,df_knn.Y_pred))",Yes,2,7.0 "def rooms_pop(row):
row['rooms_pop'] = row['total_rooms'] / row['population']
return row
traindata = traindata.apply(rooms_pop, axis=1)
traindata = traindata.drop(['population'], axis=1)
plt.figure(figsize=(6,6))
plt.title('Correlation matrix')
sns.heatmap(traindata.corr(), annot=True, linewidths=0.1)",Yes,3,80.0 "def age_rooms(row):
row['age_rooms'] = row['median_age'] / row['total_rooms']
return row
traindata = traindata.apply(age_rooms, axis=1)
traindata = traindata.drop(['median_age'], axis=1)
plt.figure(figsize=(6,6))
plt.title('Correlation matrix')
sns.heatmap(traindata.corr(), annot=True, linewidths=0.1)",Yes,3,80.0 "traindata = traindata.drop(columns = ['latitude','longitude'])",No,5,10.0 "newknn_trainX = traindata[[""per_capita"",""total_bedrooms"",""rooms_pop"",""households"",""age_rooms""]]
neighbor = KNeighborsRegressor(n_neighbors=2)
neighbor.fit(newknn_trainX,trainY)
knn_predict = neighbor.predict(newknn_trainX)
df_knn = pd.DataFrame({\'Y_real\':trainY[:],\'Y_pred\':knn_predict[:]})
print(rmsle(df_knn.Y_real,df_knn.Y_pred))",Yes,2,7.0 "x_val_test = testX
y_val_test = neighbor.predict(x_val_test)

dfSave = pd.DataFrame(data={""Id"" : testdata[""Id""], ""median_house_value"" : y_val_test})
dfSave[\'Id\'] = dfSave[\'Id\'].astype(int)
pd.DataFrame(dfSave[[""Id"", ""median_house_value""]], columns = [""Id"", ""median_house_value""]).to_csv(""Output.csv"", index=False)
dfSave.head()",Yes,3,25.0 "%matplotlib inline

from matplotlib import pyplot as plt
import pandas as pd
import numpy as np

import os

df = pd.read_csv(""../input/atividade-3-pmr3508/train.csv"",
sep=r\'\\s*,\\s*\',
engine=\'python\',
na_values=""?"")
df = df.replace(np.nan,\' \', regex=True)",No,3,45.0 df.describe(),No,5,40.0 "X[""coord""] = X[""latitude""].map(str) + ', ' + X[""longitude""].map(str)",No,5,8.0 "cities = pd.DataFrame(address.Id)
cities = cities.set_index(\'Id\')
cities[\'city\'] = \'\'

for index, row in address.iterrows():
city = row[\'location\'][-2]
row_id = row[\'Id\']
cities.at[row_id, \'city\'] = city.lstrip(\' \')

cities.to_csv(""cities.csv"", index=False)",Yes,4,25.0 "cities = pd.read_csv(""../input/california-prices/cities.csv"",
sep=r\'\\s*""\\s*\',
engine=\'python\')
X[\'cities\'] = cities[\'city\']",Yes,4,45.0 "calPrices = pd.read_csv(""../input/california-prices/calif.csv"",
sep=r\'\\s*,\\s*\\s*""\',
engine=\'python\')


calPrices = calPrices.replace(\'---\',\' \', regex=True)
calPrices.columns = [col.replace(\'""\', \'\') for col in calPrices.columns]
calPrices = calPrices.filter([\'Region Name\', \'Current\'])
calPrices = calPrices.replace(\'""\',\'\', regex=True)
calPrices = calPrices.replace(\'\\$\',\'\', regex=True)
calPrices = calPrices.replace(\',\',\'\', regex=True)

calPrices = calPrices.drop(calPrices.index[0])",Yes,4,78.0 calPrices.head(),No,5,41.0 "X['people_pb'] = X.population/X.total_bedrooms X['people_ph'] = X.population/X.households X['income_pr'] = X.median_income/X.total_rooms",No,5,8.0 "X = X.replace('', np.NaN) X = X.replace(' ', np.NaN) X = X.dropna() X = X.drop(['Id','longitude','latitude','coord'], axis = 1)",Yes,4,17.0 "X_train = X.filter(['median_age', 'total_rooms','total_bedrooms', 'population', 'households', 'median_income', 'city_price', 'people_pb','people_ph', 'income_pr'], axis = 1) X_train.describe()",Yes,4,14.0 "from sklearn import preprocessing scaler = preprocessing.StandardScaler().fit(X_train) X_train_scaled = scaler.transform(X_train)",No,5,18.0 "from sklearn import linear_model from sklearn.model_selection import cross_val_score from sklearn.metrics import mean_squared_error reg = linear_model.LinearRegression().fit(X_train_scaled, Y) reg.score(X_train_scaled, Y)",Yes,4,7.0 "from sklearn import neighbors knn = neighbors.KNeighborsRegressor(n_neighbors=6) knn.fit(X_train_scaled, Y) knn_scores = cross_val_score(knn, X_train_scaled, Y, cv=10) np.mean(knn_scores)",Yes,3,7.0 "from sklearn import ensemble params = {'n_estimators': 500, 'max_depth': 5, 'min_samples_split': 2, 'learning_rate': 0.01, 'loss': 'ls'} gbr = ensemble.GradientBoostingRegressor(**params) gbr.fit(X_train_scaled, Y)",Yes,4,7.0 "Yreg = reg.predict(X_train_scaled) Yknn = knn.predict(X_train_scaled) Ygbr = gbr.predict(X_train_scaled)",No,5,27.0 "from sklearn.metrics import mean_squared_log_error mean_squared_log_error(Y, Yknn) ",No,5,28.0 "mean_squared_log_error(Y, Ygbr) ",No,5,28.0 "testdf = pd.read_csv(""../input/atividade-3-pmr3508/test.csv"",
sep=r\'\\s*,\\s*\',
engine=\'python\',
na_values=""?"")
testdf = testdf.replace(np.nan,\' \', regex=True)",Yes,4,45.0 "testdf[""coord""] = testdf[""latitude""].map(str) + \', \' + testdf[""longitude""].map(str)",No,5,8.0 "texts = [""Hooli stock price soared after a dip in PiedPiper revenue growth."",
""Captain Tsubasa scores a magnificent goal for the Japanese team."",
""Merryweather mercenaries are sent on another mission, as government oversight groups call for new sanctions."",
""Beyonc releases a new album, tops the charts in all of south-east Asia!"",
""You won\'t guess what the latest trend in data analysis is!""]
text_features = tfidf.transform(texts)
predictions = model.predict(text_features)
for text, predicted in zip(texts, predictions):
print(\'""{}""\'.format(text))
print("" - Predicted as: \'{}\'"".format(id_to_category[predicted]))
print("""")",Yes,4,8.0 "test_loc = pd.read_csv(""../input/california-prices/test_loc.csv"",
sep=r\'\\s*""\',
engine=\'python\')

test_loc.columns = [col.replace(\'""\', \'\') for col in test_loc.columns]
test_loc = test_loc.replace(\'""\',\'\', regex=True)

test_loc[\'location\'] = test_loc[\'location\'].str.split("","")

test_loc = test_loc.replace(\',\',\'\', regex=True)",Yes,4,78.0 "#import os print(os.listdir(""../input/bbc-test""))",No,5,88.0 "TEST_PATH = os.path.join(""../input/bbc-test"", ""BBC News Test (1).csv"") #Load the data using pandas : Create a DataFrame test_df = pd.read_csv(TEST_PATH) ",No,5,45.0 "test_cities = pd.DataFrame(test_loc.Id)
test_cities = test_cities.set_index(\'Id\')
test_cities[\'city\'] = \'\'

for index, row in test_loc.iterrows():
row_id = row[\'Id\']
city = row[\'location\'][-2]
if city != \'n\':
test_cities.at[row_id, \'city\'] = city.lstrip(\' \')

test_cities.to_csv(""test_cities.csv"", index=False)",Yes,4,25.0 "test_cities = pd.read_csv(""../input/california-prices/test_cities.csv"",
engine=\'python\')

testdf[\'cities\'] = test_cities[\'city\']
",Yes,4,45.0 "import pandas as pd import numpy",No,5,22.0 "data = pd.read_csv(""../input/datasetss/train.csv"")",No,5,45.0 "X_test = testdf.copy() X_test['people_pb'] = X_test.population/X_test.total_bedrooms X_test['people_ph'] = X_test.population/X_test.households X_test['income_pr'] = X_test.median_income/X_test.total_rooms X_test = X_test.filter(['median_age','total_rooms', 'total_bedrooms', 'population', 'households', 'median_income', 'city_price', 'people_pb','people_ph', 'income_pr'], axis = 1) X_test.describe()",Yes,3,8.0 data,No,5,41.0 X_test_scaled = scaler.transform(X_test),No,5,18.0 Ypred = knn.predict(X_test_scaled),No,5,48.0 "prediction.to_csv(""prediction.csv"", index=False)",No,5,25.0 "data = data.drop('Id', axis=1)",No,5,10.0 "import pandas as pd import matplotlib.pyplot as plt import numpy as np import sklearn as skl from sklearn.neighbors import KNeighborsRegressor as KNR from sklearn.tree import DecisionTreeRegressor as DTR from sklearn.linear_model import Lasso, LassoCV from sklearn.model_selection import cross_val_score as cvs",No,5,22.0 data.isna().sum(),No,5,39.0 "arquivo1 = '../input/test.csv' tester = pd.read_csv(arquivo1, engine = 'python') tester.shape",Yes,4,45.0 "X_train = data.drop('median_house_value', axis=1) y_train = data.median_house_value",No,5,21.0 "from sklearn import linear_model from sklearn.model_selection import cross_val_score",No,5,22.0 "arquivo2 = '../input/train.csv' trainer = pd.read_csv(arquivo2, engine = 'python') trainer.shape",Yes,4,45.0 trainer.head(),No,5,41.0 "best_knr = KNR(n_neighbors=melhor_knr(train,5,50)[1])",No,4,4.0 "best_knr.fit(train,trainer['median_house_value'])",No,5,7.0 knr_pred = best_knr.predict(test),No,5,48.0 "Submit1 = pd.DataFrame() Submit1.insert(0, 'Id', tester['Id']) Submit1.insert(1,'median_house_value', knr_pred)",Yes,4,12.0 best_tree = DTR(max_depth=10),No,5,4.0 "best_tree.fit(train,target)",No,5,7.0 "tree_pred = best_tree.predict(test) tree_pred",No,5,48.0 "lcv = LassoCV().fit(train, target) lcv.score(train, target)",Yes,4,7.0 "lasso = Lasso(max_iter = 100000, selection = 'random') lasso.fit(train, target) pred_lasso = lasso.predict(test) pred_lasso",Yes,3,7.0 "# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here\'s several helpful packages to load in

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from scipy.sparse import csr_matrix
from tqdm import tqdm
from sklearn.datasets import load_svmlight_file
# Input data files are available in the ""../input/"" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir(""../input/movie-ratings""))

# Any results you write to the current directory are saved as output.",No,5,88.0 "training_labels[training_labels == 0] = -1 testing_labels[testing_labels == 0] = -1",No,5,8.0 "from sklearn.model_selection import cross_val_score, GridSearchCV clf = GridSearchCV(AveragePerceptron(), param_grid={'learning_rate':[1, 0.1, 0.01], 'margin':[ 0, 0.1], 'decay': [False, True], 'epochs':[50], 'avg_decay':[False, True]}, cv=10, scoring='accuracy', n_jobs=-1) clf.fit(testing_data, testing_labels)",No,5,6.0 "clf.best_estimator_.score(testing_data, testing_labels)",No,5,49.0 "with open('submission.csv', 'w') as submission:
with open('../input/movie-ratings/movie-ratings/data-splits/data.eval.anon.id', 'r') as example_ids:
submission.write('example_id,label\
')
for example_id, label in zip(example_ids, submission_pred):
submission.write('{},{}\
'.format(example_id.strip(), int(label)))",No,4,25.0 "%matplotlib inline import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) from fastai.structured import * import matplotlib.pyplot as plt from sklearn.metrics import mean_squared_error from math import sqrt from sklearn.ensemble import RandomForestRegressor from IPython.core.debugger import set_trace from sklearn.model_selection import KFold import os print(os.listdir(""../input"")) PATH = ""../input/""",No,4,88.0 "df_train = pd.read_csv(f'{PATH}train.csv', parse_dates=['Open Date']) df_test = pd.read_csv(f'{PATH}test.csv', parse_dates=['Open Date']) df_joined = pd.concat([df_train.drop('revenue', axis=1), df_test], axis=0)",Yes,4,45.0 "X_train, X_test = prcs(df_joined.copy()) y_train = df_train['revenue'].copy().apply(np.log)",No,5,21.0 "m = RandomForestRegressor(n_jobs=-1, n_estimators=150, oob_score=True, max_features=0.5) m.fit(X_train, y_train) score(m,X_train, y_train)",Yes,4,7.0 "df_preds = pd.DataFrame(columns=['Prediction'],index=X_test.index, data=np.exp(predict(m, X_test))) df_preds.to_csv('submission0.csv', index=True, index_label='Id') df_preds.head()",Yes,4,25.0 "df_preds = pd.DataFrame(columns=['Prediction'],index=X_test.index, data=np.exp(predict(models, X_test))) df_preds.to_csv('submission1.csv', index=True, index_label='Id') df_preds.head()",Yes,4,25.0 "X_train, X_test = prcs(df_joined.copy(), fe=['id'])

# Double transformation so that the distribution is Normal
y_train = df_train['revenue'].copy().apply(np.sqrt).apply(np.log)",No,5,21.0 "df_preds = pd.DataFrame(columns=['Prediction'],index=X_test.index, data=np.square(np.exp(predict(models, X_test)))) df_preds.to_csv('submission2.csv', index=True, index_label='Id') df_preds.head()",Yes,4,25.0 "X_train, X_test = prcs(df_joined.copy(), fe=['id', 'dummies'])

# Double transformation so that the distribution is Normal
y_train = df_train['revenue'].copy().apply(np.sqrt).apply(np.log)",No,5,21.0 "df_preds = pd.DataFrame(columns=['Prediction'],index=X_test.index, data=np.square(np.exp(predict(models, X_test)))) df_preds.to_csv('submission3.csv', index=True, index_label='Id') df_preds.head()",Yes,4,25.0 "X_train, X_test = prcs(df_joined.copy(), fe=['id', 'dummies', 'city'])

# Double transformation so that the distribution is Normal
y_train = df_train['revenue'].copy().apply(np.sqrt).apply(np.log)",No,5,21.0 "df_preds = pd.DataFrame(columns=['Prediction'],index=X_test.index, data=np.square(np.exp(predict(models, X_test)))) df_preds.to_csv('submission4.csv', index=True, index_label='Id') df_preds.head()",Yes,4,25.0 "X_train, X_test = prcs(df_joined.copy(), fe=['id', 'dummies', 'city', 'city_group'])

# Double transformation so that the distribution is Normal
y_train = df_train['revenue'].copy().apply(np.sqrt).apply(np.log)",No,5,21.0 "df_preds = pd.DataFrame(columns=['Prediction'],index=X_test.index, data=np.square(np.exp(predict(models, X_test)))) df_preds.to_csv('submission5.csv', index=True, index_label='Id') df_preds.head()",Yes,4,25.0 "import numpy as np import pandas as pd import matplotlib.pyplot as plt import seaborn as sns %matplotlib inline import warnings warnings.filterwarnings(""ignore"")",No,5,23.0 "t = pd.read_csv('../input/training/training.csv') ts = pd.read_csv('../input/test/test.csv')",No,5,45.0 t.shape[0],No,5,58.0 "Y = np.array(t.drop('Image', axis=1).fillna(method='ffill'),dtype=float)",No,5,21.0 "#import packages from keras.models import Sequential from keras.layers import Dense, Activation, Dropout, Conv2D, MaxPool2D,Flatten, LeakyReLU #from keras.layers import LeakyReLU(alpha=0.3) as activation",No,5,22.0 "# Set the CNN model
# my CNN architecture is In -> Conv2D -> LeakyReLU -> MaxPool2D -> Dropout -> Flatten -> [Dense -> LeakyReLU -> Dropout]*2 -> Dense(30) -> Out

model = Sequential()

model.add(Conv2D(filters = 32, kernel_size = (5,5),padding = \'Same\',
activation =\'linear\', input_shape = (96,96,1)))
model.add(LeakyReLU(alpha=.001))
model.add(MaxPool2D(pool_size=(2,2), strides=(2,2)))
model.add(Dropout(0.25))

model.add(Flatten())
model.add(Dense(256, activation = ""linear""))
model.add(LeakyReLU(alpha=.001))
model.add(Dropout(0.5))
model.add(Dense(128, activation = ""linear""))
model.add(LeakyReLU(alpha=.001))
model.add(Dropout(0.5))
model.add(Dense(30))",No,5,84.0 "model.fit(X, Y, epochs=100, batch_size=128,validation_split = 0.2)",No,5,7.0 Y_ts = model.predict(X_ts),No,5,48.0 "look_id = pd.read_csv('../input/IdLookupTable.csv') look_id.drop('Location',axis=1,inplace=True)",Yes,4,45.0 look_id['location_id'] = look_id.FeatureName.map(maps),No,5,20.0 "look_id[['RowId','Location']].to_csv('Sub1.csv',index=False)",No,5,25.0 "%matplotlib inline import os import pandas as pd, numpy as np import matplotlib.pyplot as plt from skimage.io import imread import seaborn as sns # nice visuals from sklearn.model_selection import train_test_split # splitting data # quantifying models from sklearn.metrics import accuracy_score, classification_report, roc_curve, roc_auc_score, confusion_matrix data_dir = '../input/'",No,5,77.0 "trn_image, vld_image, trn_label , vld_label = train_test_split(full_train_df['image'], full_train_df['opacity'], test_size=0.25, random_state=2018) trn_image = np.stack(trn_image, 0) vld_image = np.stack(vld_image, 0)",Yes,3,13.0 "out_model = models.Sequential() out_model.add(layers.Reshape((64, 64, 1), input_shape=trn_image.shape[1:])) out_model.add(layers.Conv2D(16, (3, 3), padding='valid', activation='relu')) out_model.add(layers.MaxPool2D((2, 2))) out_model.add(layers.Conv2D(32, (3, 3), padding='valid', activation='relu')) out_model.add(layers.MaxPool2D((2, 2))) out_model.add(layers.Conv2D(64, (3, 3), padding='valid', activation='relu')) out_model.add(layers.MaxPool2D((2, 2))) out_model.add(layers.Conv2D(128, (3, 3), padding='valid', activation='relu')) out_model.add(layers.MaxPool2D((2, 2))) out_model.add(layers.GlobalAveragePooling2D()) out_model.add(layers.Dense(32, activation='relu')) out_model.add(layers.Dense(1, activation='sigmoid')) out_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['binary_accuracy']) out_model.summary()",Yes,4,4.0 "from IPython.display import clear_output fit_results = out_model.fit(trn_image, trn_label, validation_data=(vld_image, vld_label), epochs=100) clear_output()",No,5,7.0 "fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 10)) ax1.plot(fit_results.history['loss'], label='Training') ax1.plot(fit_results.history['val_loss'], label='Validation') ax1.legend() ax1.set_title('Loss History') ax2.plot(100*np.array(fit_results.history['binary_accuracy']), label='Training') ax2.plot(100*np.array(fit_results.history['val_binary_accuracy']), label='Validation') ax2.legend() ax2.set_title('Accuracy History')",No,5,35.0 "# Import the necessary libraries import numpy as np import pandas as pd import os import time import warnings import gc from six.moves import urllib import matplotlib import matplotlib.pyplot as plt warnings.filterwarnings('ignore')",No,5,23.0 "#Add All the Models Libraries # Scalers from sklearn.preprocessing import StandardScaler from sklearn.utils import shuffle from sklearn.pipeline import Pipeline from sklearn.pipeline import FeatureUnion from sklearn.preprocessing import LabelEncoder # Models from sklearn.linear_model import LogisticRegression #logistic regression from sklearn.svm import SVC # Support Vector Classifier from sklearn.ensemble import RandomForestClassifier #Random Forest from sklearn.neighbors import KNeighborsClassifier #KNN from sklearn.ensemble import ExtraTreesClassifier from sklearn.ensemble import VotingClassifier from sklearn.neural_network import MLPClassifier from sklearn.tree import DecisionTreeClassifier #Decision Tree from sklearn.ensemble import BaggingClassifier from sklearn.ensemble import 
GradientBoostingClassifier from sklearn.model_selection import train_test_split #training and testing data split from sklearn import metrics #accuracy measure from sklearn.metrics import confusion_matrix #for confusion matrix from scipy.stats import reciprocal, uniform from sklearn.ensemble import AdaBoostClassifier # Cross-validation from sklearn.model_selection import KFold #for K-fold cross validation from sklearn.model_selection import cross_val_score #score evaluation from sklearn.model_selection import cross_val_predict #prediction from sklearn.model_selection import cross_validate # GridSearchCV from sklearn.model_selection import GridSearchCV from sklearn.model_selection import RandomizedSearchCV #Common data processors from sklearn.preprocessing import OneHotEncoder, LabelEncoder from sklearn import feature_selection from sklearn import model_selection from sklearn import metrics from sklearn.base import BaseEstimator, TransformerMixin from sklearn.utils import check_array from scipy import sparse #Accuracy Score from sklearn.metrics import accuracy_score",No,5,22.0 "# to make this notebook's output stable across runs np.random.seed(123) # To plot pretty figures %matplotlib inline plt.rcParams['axes.labelsize'] = 14 plt.rcParams['xtick.labelsize'] = 12 plt.rcParams['ytick.labelsize'] = 12",No,5,23.0 "train = pd.read_csv(""../input/train_users_2.csv"")
test = pd.read_csv(""../input/test_users.csv"")
id_test = test['id']
labels = train['country_destination'].values
df_train = train.drop(['country_destination'], axis=1)
train_flag = df_train.shape[0]",Yes,3,45.0 "#We now concat Training and Test set df_total = pd.concat((df_train, test),axis=0, ignore_index = True)",No,5,11.0 "df_total = df_total.drop(['id','date_first_booking'], axis=1)",No,5,10.0 "#Date Account created - Capture Date, month and year seperately. date_ac = np.vstack(df_total.date_account_created.astype(str).apply(lambda x:list(map(int,x.split('-')))).values) df_total['Day'] = date_ac[:,0] df_total['Month']= date_ac[:,1] df_total['year'] = date_ac[:,2] df_total = df_total.drop(['date_account_created'],axis=1)",Yes,3,11.0 "#Time Stamp first active time_stp = np.vstack(df_total.timestamp_first_active.astype(str) .apply(lambda x: list(map(int,[x[:4],x[4:6],x[6:8],x[8:]]))).values) df_total['tfa_day'] = time_stp[:,0] df_total['tfa_Month'] = time_stp[:,1] df_total['tfa_year'] = time_stp[:,2] df_total = df_total.drop(['timestamp_first_active'],axis=1)",Yes,3,11.0 "forest_class = RandomForestClassifier(random_state = 42) n_estimators = [100, 500] min_samples_split = [10, 20] param_grid_forest = {'n_estimators' : n_estimators, 'min_samples_split' : min_samples_split} rand_search_forest = GridSearchCV(forest_class, param_grid_forest, cv = 4, refit = True, n_jobs = -1, verbose=2) rand_search_forest.fit(final_train_X, train_set_y)",Yes,3,6.0 "df = pd.read_csv('../input/train.csv', parse_dates=[0])",No,5,45.0 "test = pd.read_csv('../input/test.csv', parse_dates=[0])",No,5,45.0 "df.rename(columns={'count': 'rentals'}, inplace=True)",No,5,61.0 df['rentals'] = np.log(df['rentals']),No,5,8.0 import math,No,5,22.0 "pd.concat([df, test])",No,5,11.0 "df = pd.concat([df, test])",No,5,11.0 "df['year'] = df.datetime.dt.year df['hour'] = df.datetime.dt.hour df['dayofweek'] = df.datetime.dt.dayofweek",No,5,8.0 "train, valid = train_test_split(df, random_state=42)",No,5,13.0 "removed_cols = ['rentals', 'casual', 'registered', 'datetime']",No,5,77.0 feats = [c for c in df.columns if c not in removed_cols],No,5,77.0 from sklearn.tree import DecisionTreeRegressor,No,5,22.0 "dt = DecisionTreeRegressor(random_state=42, max_depth=2)",No,5,4.0 "dt.fit(train[feats], train['rentals'])",No,5,7.0 from fastai.structured import draw_tree,No,5,22.0 from sklearn.metrics import mean_squared_error,No,5,22.0 "mean_squared_error(train['rentals'], train['preds'])",No,5,28.0 from sklearn.ensemble import RandomForestRegressor,No,5,22.0 "rf = RandomForestRegressor(random_state=42, n_jobs=-1)",No,5,4.0 "rf.fit(train[feats], train['rentals'])",No,5,7.0 train_preds = rf.predict(train[feats]),No,5,27.0 "mean_squared_error(train['rentals'], train_preds)**(1/2)",No,5,28.0 valid_preds = rf.predict(valid[feats]),No,5,48.0 "mean_squared_error(valid['rentals'], valid_preds)**(1/2)",No,5,49.0 "test[['datetime', 'count']].to_csv('rf.csv', index=False)",No,5,25.0 "# Prepare the train data train_data = process_data(train_images, TRAIN_DIR, isTrain=True) X = np.array([i[0] for i in train_data]).reshape(-1, IMG_SIZE, IMG_SIZE, 3) y = np.array([i[1] for i in train_data])",No,5,21.0 plot_accuracy_and_loss(history),No,5,35.0 "import numpy as np
import pandas as pd
import keras
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from matplotlib import pyplot as plt
import os
import re
print(os.listdir(""../input""))
def to_int(obj):
return int(re.sub(""[^\\d]"", \'\', obj))",No,5,88.0 "df = pd.read_csv('../input/2TWH_train.csv', index_col='IDNum') for col in df: if col[0] == ' ': df = df.rename(columns={col : col[1:]}) dt = pd.read_csv('../input/test.csv', index_col='IDNum') for col in dt: if col[0] == ' ': dt = dt.rename(columns={col : col[1:]})",Yes,3,45.0 "df['Source IP'] = df['Source IP'].apply(to_int) df['Destination IP'] = df['Destination IP'].apply(to_int) df['Timestamp'] = df['Timestamp'].apply(to_int) df['Flow Bytes/s'] = df['Flow Bytes/s'].astype(float) df['Flow Packets/s'] = df['Flow Packets/s'].astype(float) dt['Source IP'] = dt['Source IP'].apply(to_int) dt['Destination IP'] = dt['Destination IP'].apply(to_int) dt['Timestamp'] = dt['Timestamp'].apply(to_int) dt['Flow Bytes/s'] = dt['Flow Bytes/s'].astype(float) dt['Flow Packets/s'] = dt['Flow Packets/s'].astype(float)",No,5,16.0 "df = df.replace([np.inf, 'Infinity', 'infinity', 'inf'], 2**31-1) df = df.replace([np.nan, np.inf, 'NaN'], 0) dt = dt.replace([np.inf, 'Infinity', 'infinity', 'inf'], 2**31-1) dt = dt.replace([np.nan, 'NaN'], 0)",Yes,4,17.0 "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42) ohe = OneHotEncoder(categories='auto') y_train_ohe = ohe.fit_transform(y_train.reshape(-1, 1)) y_test_ohe = ohe.fit_transform(y_test.reshape(-1, 1)) y_ohe = ohe.fit_transform(y.reshape(-1, 1)) scaler = StandardScaler() X_scale = scaler.fit_transform(X.astype(float)) X_train_scale = scaler.fit_transform(X_train.astype(float)) X_test_scale = scaler.transform(X_test.astype(float))",Yes,3,13.0 "from keras import Sequential from keras.layers import Dense model = Sequential() model.add(Dense(units=20, activation='relu', input_dim=X.shape[1])) model.add(Dense(units=10, activation='relu')) model.add(Dense(units=3, activation='softmax')) model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])",No,5,4.0 "history = model.fit(X_train_scale, y_train_ohe, epochs=1000, batch_size=8192, validation_data=(X_test_scale, y_test_ohe))",No,5,7.0 "import matplotlib.pyplot as plt loss = history.history['loss'] val_loss = history.history['val_loss'] epochs = range(1, len(loss) + 1) plt.figure(figsize = (10, 10)) plt.semilogy(epochs, loss, 'bo', label='Training loss') plt.semilogy(epochs, val_loss, 'red', label='Validation loss') plt.title('Training and validation loss') plt.xlabel('Epochs') plt.ylabel('Loss') plt.legend() plt.show()",No,5,35.0 "plt.clf() history_dict = history.history acc_values = history_dict['acc'] val_acc_values = history_dict['val_acc'] plt.figure(figsize = (10, 10)) plt.semilogy(epochs, acc_values, 'bo', label='Training acc') plt.semilogy(epochs, val_acc_values, 'red', label='Validation acc') plt.title('Training and validation accuracy') plt.xlabel('Epochs') plt.ylabel('Accuracy') plt.legend() plt.show()",No,5,35.0 "# Load packages import pandas as pd import numpy as np import seaborn as sns import matplotlib.pyplot as plt from tqdm import tqdm from random import shuffle import os, gc, time, cv2, random, math %matplotlib inline import warnings warnings.filterwarnings('ignore') #################### # Global Constants # #################### INCEPTION_V3_WEIGHTS_PATH = '../input/inceptionv3/inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5' PATH = '../input/dogs-vs-cats-redux-kernels-edition/' TRAIN_DIR = PATH+'train' TEST_DIR = PATH+'test' NUM_CLASSES = 2 IMG_SIZE = 145 ### CHANNELS = 3 EPOCHS = 30 BATCH_SIZE = 32 train_images = os.listdir(TRAIN_DIR) test_images = 
os.listdir(TEST_DIR) # # For testing purposes # train_images = train_images[:10000] # test_images = test_images[:100]",No,4,77.0 "# Plotting loss and accuracy for the model def plot_accuracy_and_loss(history): eval_res = pd.DataFrame(history.history) f, ax = plt.subplots(1,2, figsize=(18,5)) for i, c in enumerate(['acc', 'loss']): ax[i].plot(eval_res[[c]], label=f'Training {c}') ax[i].plot(eval_res[[f'val_{c}']], label=f'Validation {c}') ax[i].set_xlabel('Epoch'); ax[i].set_ylabel(c); ax[i].legend(); ax[i].set_title(f'Training and validation {c}'); plt.grid(); plt.show() plot_accuracy_and_loss(history)",No,5,35.0 "import os print(os.listdir(""../input""))",No,5,88.0 "import pandas as pd from pathlib import Path from sklearn.preprocessing import StandardScaler from sklearn.ensemble import RandomForestRegressor from sklearn.linear_model import LinearRegression",No,5,22.0 "data_dir = Path('../input') train_df = pd.read_csv(data_dir / 'train.csv') test_df = pd.read_csv(data_dir / 'test.csv') sample_submission = pd.read_csv(data_dir / 'sampleSubmission.csv')",No,5,45.0 "print(train_df.shape) train_df.head()",Yes,3,58.0 "print(test_df.shape) test_df.head()",Yes,3,58.0 "features = ['open', 'high', 'low', 'close', 'volume', 'trades', 'macd', 'macd_hist', 'macd_signal', 'adx', 'di_plus', 'di_minus', 'rsi', 'cci', 'adl']",No,5,77.0 "X_train = train_df[features] y_train = train_df['y'] X_test = test_df[features]",No,5,21.0 "scaler = StandardScaler(with_std=False) X_train = scaler.fit_transform(X_train) X_test = scaler.transform(X_test)",No,5,18.0 "regressor = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1, max_depth=7) regressor.fit(X_train, y_train)",No,5,7.0 y_test = regressor.predict(X_test),No,5,48.0 "sample_submission.to_csv('submission.csv', index=False)",No,5,25.0 "import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv) import os print(os.listdir(""../input""))",No,5,88.0 "import pandas as pd import numpy as np from sklearn.model_selection import train_test_split from sklearn.linear_model import LinearRegression as LR from sklearn.metrics import mean_squared_error import lightgbm as lgb from sklearn.model_selection import GridSearchCV from sklearn.model_selection import RandomizedSearchCV",No,5,22.0 "import warnings warnings.filterwarnings(""ignore"")",No,5,23.0 "data = pd.read_csv('../input/train.csv') target = data.pop('y') data_test = pd.read_csv('../input/test.csv')",No,5,45.0 "out_df = pd.DataFrame(np.concatenate(preds_df), columns=['id','expected']) out_df.id = out_df.id.astype('int') out_df.to_csv('by_asset_lgbm.csv',index = False)",Yes,4,25.0 "out_df = pd.DataFrame(np.concatenate(grid_preds_df), columns=['id','expected']) out_df.id = out_df.id.astype('int') out_df.to_csv('grid_by_asset.csv',index = False)",Yes,4,25.0 "out_df = pd.DataFrame(np.concatenate(preds_df), columns=['id','expected']) out_df.id = out_df.id.astype('int') out_df.to_csv('Elastic_net.csv',index = False)",Yes,4,25.0 "data_dir = Path('../input') train_df = pd.read_csv(data_dir / 'train.csv') test_df = pd.read_csv(data_dir / 'test.csv') sample_submission = pd.read_csv(data_dir / 'sampleSubmission.csv') train_df['d'] = train_df['close'] / train_df['open'] test_df['d'] = test_df['close'] / test_df['open']",Yes,4,45.0 " features = ['asset','di_minus', 'rsi', 'cci','volume'] features = ['asset', 'open', 'high', 'low', 'close', 'volume', 'trades', 'macd', 'macd_hist', 'macd_signal', 'adx', 'di_plus', 'di_minus', 'rsi', 'cci', 'adl']",No,5,77.0 "X_train = train_df[features] y_train = train_df[['asset', 'y']] X_test = test_df[features]",No,5,21.0 "from numpy import column_stack scaler = StandardScaler() X_train = pd.DataFrame(data=scaler.fit_transform(train_df[features]), columns=features) X_train['asset'] = train_df['asset'] X_test = pd.DataFrame(data=scaler.transform(test_df[features]), columns=features) X_test['asset'] = test_df['asset']",No,5,21.0 "regressor = TssRegressor() regressor.fit(X_train, y_train) y_test = [item[1] for item in regressor.predict(X_test)] y_test[:12] = [0]*12 sample_submission['expected'] = y_test",Yes,3,7.0 "import pylab import calendar import numpy as np import pandas as pd import seaborn as sn from scipy import stats import missingno as msno from datetime import datetime import matplotlib.pyplot as plt import warnings pd.options.mode.chained_assignment = None warnings.filterwarnings(""ignore"", category=DeprecationWarning) %matplotlib inline",No,5,23.0 "dailyData = pd.read_csv(""../input/train.csv"")",No,5,45.0 dailyData.shape,No,5,58.0 dailyData.head(2),No,5,41.0 dailyData.dtypes,No,5,70.0 "dailyData[""date""] = dailyData.datetime.apply(lambda x : x.split()[0])
dailyData[""hour""] = dailyData.datetime.apply(lambda x : x.split()[1].split("":"")[0])
dailyData[""weekday""] = dailyData.date.apply(lambda dateString : calendar.day_name[datetime.strptime(dateString,""%Y-%m-%d"").weekday()])
dailyData[""month""] = dailyData.date.apply(lambda dateString : calendar.month_name[datetime.strptime(dateString,""%Y-%m-%d"").month])
dailyData[""season""] = dailyData.season.map({1: ""Spring"", 2 : ""Summer"", 3 : ""Fall"", 4 :""Winter"" })
dailyData[""weather""] = dailyData.weather.map({1: "" Clear + Few clouds + Partly cloudy + Partly cloudy"",\\
2 : "" Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist "", \\
3 : "" Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds"", \\
4 :"" Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog "" })",No,4,8.0 "dailyData = dailyData.drop([""datetime""],axis=1)",No,5,10.0 "fig, axes = plt.subplots(nrows=2,ncols=2)
fig.set_size_inches(12, 10)
sn.boxplot(data=dailyData,y=""count"",orient=""v"",ax=axes[0][0])
sn.boxplot(data=dailyData,y=""count"",x=""season"",orient=""v"",ax=axes[0][1])
sn.boxplot(data=dailyData,y=""count"",x=""hour"",orient=""v"",ax=axes[1][0])
sn.boxplot(data=dailyData,y=""count"",x=""workingday"",orient=""v"",ax=axes[1][1])

axes[0][0].set(ylabel='Count',title=""Box Plot On Count"")
axes[0][1].set(xlabel='Season', ylabel='Count',title=""Box Plot On Count Across Season"")
axes[1][0].set(xlabel='Hour Of The Day', ylabel='Count',title=""Box Plot On Count Across Hour Of The Day"")
axes[1][1].set(xlabel=\'Working Day\', ylabel=\'Count\',title=""Box Plot On Count Across Working Day"")",No,5,33.0 "dailyDataWithoutOutliers = dailyData[np.abs(dailyData[""count""]-dailyData[""count""].mean())<=(3*dailyData[""count""].std())] ",No,5,14.0 "print (""Shape Of The Before Ouliers: "",dailyData.shape) print (""Shape Of The After Ouliers: "",dailyDataWithoutOutliers.shape)",No,5,58.0 "dataTrain = pd.read_csv(""../input/train.csv"") dataTest = pd.read_csv(""../input/test.csv"")",No,5,45.0 "data = dataTrain.append(dataTest) data.reset_index(inplace=True) data.drop('index',inplace=True,axis=1)",Yes,4,10.0 "data[""date""] = data.datetime.apply(lambda x : x.split()[0]) data[""hour""] = data.datetime.apply(lambda x : x.split()[1].split("":"")[0]).astype(""int"") data[""year""] = data.datetime.apply(lambda x : x.split()[0].split(""-"")[0]) data[""weekday""] = data.date.apply(lambda dateString : datetime.strptime(dateString,""%Y-%m-%d"").weekday()) data[""month""] = data.date.apply(lambda dateString : datetime.strptime(dateString,""%Y-%m-%d"").month)",No,5,8.0 "categoricalFeatureNames = [""season"",""holiday"",""workingday"",""weather"",""weekday"",""month"",""year"",""hour""]
numericalFeatureNames = [""temp"",""humidity"",""windspeed"",""atemp""]
dropFeatures = [\'casual\',""count"",""datetime"",""date"",""registered""]",No,5,77.0 "for var in categoricalFeatureNames: data[var] = data[var].astype(""category"")",No,5,16.0 "dataTrain = dataTrain.drop(dropFeatures,axis=1) dataTest = dataTest.drop(dropFeatures,axis=1)",No,5,10.0 "from sklearn.linear_model import LinearRegression,Ridge,Lasso from sklearn.model_selection import GridSearchCV from sklearn import metrics import warnings pd.options.mode.chained_assignment = None warnings.filterwarnings(""ignore"", category=DeprecationWarning) # Initialize logistic regression model lModel = LinearRegression() # Train the model yLabelsLog = np.log1p(yLabels) lModel.fit(X = dataTrain,y = yLabelsLog) # Make predictions preds = lModel.predict(X= dataTrain) print (""RMSLE Value For Linear Regression: "",rmsle(np.exp(yLabelsLog),np.exp(preds),False))",Yes,3,7.0 "from sklearn.ensemble import GradientBoostingRegressor gbm = GradientBoostingRegressor(n_estimators=4000,alpha=0.01); ### Test 0.41 yLabelsLog = np.log1p(yLabels) gbm.fit(dataTrain,yLabelsLog) preds = gbm.predict(X= dataTrain) print (""RMSLE Value For Gradient Boost: "",rmsle(np.exp(yLabelsLog),np.exp(preds),False))",Yes,3,7.0 "predsTest = gbm.predict(X= dataTest) fig,(ax1,ax2)= plt.subplots(ncols=2) fig.set_size_inches(12,5) sn.distplot(yLabels,ax=ax1,bins=50) sn.distplot(np.exp(predsTest),ax=ax2,bins=50)",No,5,56.0 "submission = pd.DataFrame({
""datetime"": datetimecol,
""count"": [max(0, x) for x in np.exp(predsTest)]
})
submission.to_csv(\'bike_predictions_gbm_separate_without_fe.csv\', index=False)",No,5,25.0 "import math import numpy as np import pandas as pd import matplotlib.pyplot as plt import time import os %matplotlib inline",No,5,23.0 train = pd.read_csv('../input/training/training.csv'),No,5,45.0 train.dropna(inplace=True),No,5,17.0 train.tail(3),No,5,41.0 test = pd.read_csv('../input/test/test.csv'),No,5,45.0 "train.shape, test.shape",No,5,58.0 "x = np.stack(train.Image)[..., None]",No,5,11.0 x.shape,No,5,58.0 "x_t = np.stack(test.Image)[..., None]",No,5,11.0 x_t.shape,No,5,58.0 "from IPython.display import SVG from keras.models import Sequential from keras.layers import Dense, Activation, Dropout, Conv2D, MaxPool2D, Flatten, LeakyReLU, ELU from keras.callbacks import ModelCheckpoint, EarlyStopping from keras.utils.vis_utils import model_to_dot",No,5,22.0 "np.random.seed(777) model10 = Sequential() model10.add(Conv2D(filters = 64, kernel_size = (5,5), padding = 'Same', activation = 'relu', input_shape = (96, 96, 1))) model10.add(MaxPool2D(pool_size=(2,2), strides=(2,2))) model10.add(Dropout(0.3)) model10.add(Conv2D(filters = 32, kernel_size = (4,4), padding = 'Same', activation = 'relu')) model10.add(MaxPool2D(pool_size=(2,2), strides=(2,2))) model10.add(Dropout(0.5)) model10.add(Flatten()) model10.add(Dense(128, activation = 'relu')) model10.add(Dropout(0.7)) model10.add(Dense(30, activation = 'relu'))",No,5,4.0 MODEL_DIR = '../model/',No,5,77.0 modelpath = '../model/{epoch:02d}-{val_loss:4f}.hdf5',No,5,77.0 predict = model10.predict(x),No,5,27.0 "train_loss = history.history['loss'] val_loss = history.history['val_loss'] x_len = np.arange(len(train_loss)) plt.plot(x_len, train_loss, marker='.', c='red', label='Train_loss') plt.plot(x_len, val_loss, marker='.', c='blue', label='Val_loss') plt.legend(loc='upper right') plt.grid() plt.xlabel('epoch') plt.ylabel('loss') plt.show()",No,5,35.0 y_t = model10.predict(x_t),No,5,48.0 look_id = pd.read_csv('../input/IdLookupTable.csv'),No,5,45.0 look_id.info(),No,5,40.0 "look_id.drop('Location', axis=1, inplace=True)",No,5,10.0 "look_id[['RowId','Location']].to_csv('Predict.csv',index=False)",No,5,25.0 "from xgboost import XGBClassifier import xgboost as xgb",No,5,22.0 "#for scaling from sklearn.preprocessing import StandardScaler",No,5,22.0 "data = pd.read_csv('../input/train.csv') test = pd.read_csv('../input/test.csv') ",No,5,45.0 print(data.shape),No,5,58.0 data.columns,No,5,71.0 "df=pd.concat([df,Date,Time],axis=1) #df #-----------test data------------------ df_test=pd.concat([df_test,Date_test,Time_test],axis=1) ",No,5,11.0 "df=df.drop(labels=['Dates'],axis=1) #-----------test data------------------ df_test=df_test.drop(labels=['Dates'],axis=1) ",No,5,10.0 df.columns,No,5,71.0 "df[""rot60_X""]=(0.5) * df[""Y""] + (1.732/2) * df[""X""] df[""rot60_Y""]=0.5 * df[""Y""] - (1.732/2) * df[""X""] df_test[""rot60_X""]=(0.5) * df_test[""Y""] + (1.732/2) * df_test[""X""] df_test[""rot60_Y""]=0.5 * df_test[""Y""] - (1.732/2) * df_test[""X""] df[""rot45_X""]=0.707 * df[""Y""] + 0.707 * df[""X""] df[""rot45_Y""]=0.707 * df[""Y""] - 0.707 * df[""X""] df_test[""rot45_X""]=0.707 * df_test[""Y""] + 0.707 * df_test[""X""] df_test[""rot45_Y""]=0.707 * df_test[""Y""] - 0.707 * df_test[""X""] df[""rot30_X""]=(1.732/2) * df[""Y""] + 0.5 * df[""X""] df[""rot30_Y""]=(1.732/2) * df[""Y""] - 0.5 * df[""X""] df_test[""rot30_X""]=(1.732/2) * df_test[""Y""] + 0.5 * df_test[""X""] df_test[""rot30_Y""]=(1.732/2) * df_test[""Y""] - 0.5 * df_test[""X""] ",No,5,8.0 
"df[""radial60""]=np.sqrt(np.power(df[\'rot60_X\'],2) + np.power(df[\'rot60_Y\'],2))

df_test[""radial60""]=np.sqrt(np.power(df_test[\'rot60_X\'],2) + np.power(df_test[\'rot60_Y\'],2))",No,5,8.0 "df=df.drop(labels='rot60_X',axis=1) df_test=df_test.drop(labels='rot60_X',axis=1)",No,5,10.0 "df=df.drop(labels='rot60_Y',axis=1) df_test=df_test.drop(labels='rot60_Y',axis=1)",No,5,10.0 "df=df.drop(labels='Second',axis=1) df_test=df_test.drop(labels='Second',axis=1)",No,5,10.0 "df['Minute']=df['Minute'].apply(lambda x:int(x)) df['Minute']=df['Minute'].apply(lambda x : 'low' if x <31 else 'high') df_test['Minute']=df_test['Minute'].apply(lambda x:int(x)) df_test['Minute']=df_test['Minute'].apply(lambda x : 'low' if x <31 else 'high') ",No,5,8.0 "df['DayOfWeek']= df['DayOfWeek'].apply(lambda x : 'WeekHigh' if x in ('Wednesday','Friday') else ('WeekMed' if x in ('Tuesday','Thursday','Saturday') else 'WeekLow')) df_test['DayOfWeek']= df_test['DayOfWeek'].apply(lambda x : 'WeekHigh' if x in ('Wednesday','Friday') else ('WeekMed' if x in ('Tuesday','Thursday','Saturday') else 'WeekLow')) ",No,5,8.0 "df['Intersection']=df['Address'].apply(lambda x : 1 if '/' in x else 0) df['Block']=df['Address'].apply(lambda x : 1 if 'Block' in x else 0) df_test['Intersection']=df_test['Address'].apply(lambda x : 1 if '/' in x else 0) df_test['Block']=df_test['Address'].apply(lambda x : 1 if 'Block' in x else 0)",No,5,8.0 "Id=df['Id'] df=df.drop(['Descript','Resolution','Id'],axis=1) #----------test data--------- Id_test=df_test['Id'] df_test=df_test.drop(['Descript','Resolution','Id'],axis=1)",No,5,10.0 "lasso = linear_model.Lasso(alpha=0.1) score1 = cross_val_score(lasso, X_train, y_train, cv=10) score1.mean()",No,5,28.0 "ridge = linear_model.Ridge(alpha=0.1) score2 = cross_val_score(ridge, X_train, y_train, cv=10) score2.mean()",No,5,28.0 from sklearn.neighbors import KNeighborsRegressor,No,5,22.0 "score3 = [] neighbor = [] for k in range(10, 100, 10): knn = KNeighborsRegressor(n_neighbors=k, weights='distance') score3.append(cross_val_score(knn, X_train, y_train, cv=10).mean()) neighbor.append(k)",No,5,2.0 score3,No,5,53.0 "plt.plot(neighbor, score3, 'ro')",No,5,33.0 "score3 = [] neighbor = [] for k in range(1, 32, 2): knn = KNeighborsRegressor(n_neighbors=k, weights='distance') score3.append(cross_val_score(knn, X_train, y_train, cv=10).mean()) neighbor.append(k)",No,5,84.0 neighbor,No,5,41.0 "knn = KNeighborsRegressor(n_neighbors=21) score3 = cross_val_score(knn, X_train, y_train, cv=10) score3.mean()",No,5,28.0 "score4 = [] trees = [] for n in range(10,101,10): regr = RandomForestRegressor(max_depth=2, random_state=0, n_estimators=n) score4.append(cross_val_score(regr, X_train, y_train, cv=10).mean()) trees.append(n)",No,2,27.0 "plt.plot(trees, score4, 'ro')",No,5,56.0 "regr = RandomForestRegressor(max_depth=2, random_state=0, n_estimators=80) score4.append(cross_val_score(regr, X_train, y_train, cv=10).mean())",No,5,28.0 score4[-1],No,5,41.0 "df[""raw_radial""]=np.sqrt(np.power(df[\'X\'],2) + np.power(df[\'Y\'],2))

df_test[""raw_radial""]=np.sqrt(np.power(df_test[\'X\'],2) + np.power(df_test[\'Y\'],2))",No,5,8.0 test = pd.read_csv('../input/datasetss/test.csv'),No,5,45.0 "le_res=le.fit_transform(df['Category']) cat=pd.DataFrame(le_res) cat.columns=['Category'] df=df.drop(labels=['Category'],axis=1) df=pd.concat([cat,df],axis=1) df.columns",Yes,3,20.0 "X_test = test.drop('Id',axis=1)",No,5,10.0 "lasso.fit(X_train, y_train)",No,5,7.0 prediction1 = lasso.predict(X_test),No,5,48.0 df_test.columns,No,5,71.0 "df=df[['Address', 'Minute', 'Hour', 'Day', 'Month', 'Year', 'District', 'DayOfWeek', 'X', 'Y', 'rot45_X', 'rot45_Y', 'rot30_X', 'rot30_Y', 'radial60', 'Intersection', 'Block', 'raw_radial', 'closest_centers_f', 'label']] df_test=df_test[['Address', 'Minute', 'Hour', 'Day', 'Month', 'Year', 'District', 'DayOfWeek', 'X', 'Y', 'rot45_X', 'rot45_Y', 'rot30_X', 'rot30_Y', 'radial60', 'Intersection', 'Block', 'raw_radial', 'closest_centers_f', 'label']]",No,5,10.0 "prediction1 = abs(prediction) prediction1[0]",Yes,4,55.0 "prediction = pd.DataFrame({'Id':test.Id,'median_house_value': prediction1[0]})",No,5,55.0 prediction,No,5,41.0 "# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import nltk
import string
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from collections import Counter
from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegressionCV,SGDClassifier
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TreebankWordTokenizer
from sklearn.model_selection import train_test_split
# Input data files are available in the ""../input/"" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
from sklearn.svm import SVC
import os
print(os.listdir(""../input""))
from wordcloud import WordCloud
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
# Any results you write to the current directory are saved as output.",No,5,88.0 train_df = pd.read_csv('../input/train.csv'),No,5,45.0 train_df.head(),No,5,41.0 "train_df['president'].value_counts().plot(kind = 'bar') plt.show()",No,5,33.0 "#Independent Column X=df X.shape ",No,5,58.0 "#Dependent X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20,shuffle=False)",No,5,13.0 "train_data = [] for i, row in train_df.iterrows(): for text in row['text'].split('.'): train_data.append([row['president'], text]) train_data = pd.DataFrame(train_data, columns=['president', 'text'])",No,5,12.0 "train_data['president'].value_counts().plot(kind = 'bar') plt.show()",No,5,33.0 train_data.head(),No,5,41.0 "def remove_punctuation_numbers(text): punc_numbers = string.punctuation + '0123456789' return ''.join([l for l in text if l not in punc_numbers])",No,5,78.0 "def tokeniser(text): return TreebankWordTokenizer().tokenize(text)",No,5,78.0 "def lemmetizer(tokens): wordnet_lemmatizer = WordNetLemmatizer() return [wordnet_lemmatizer.lemmatize(word) for word in tokens]",No,5,84.0 "def remove_stop_words(tokens): return [t for t in tokens if t not in set(stopwords.words('english'))]",No,5,84.0 "def data_cleaner(text): text = text.lower() text = remove_punctuation_numbers(text) lst = tokeniser(text) lst = remove_stop_words(lst) return ' '.join(lemmetizer(lst))",No,5,78.0 train_data['clean_text'] = train_data['text'].apply(data_cleaner),No,5,8.0 "for pres in train_data['president'].unique(): words =[] for sentence in train_data[train_data['president'] == pres].clean_text: words.extend(tokeniser(sentence)) wordcloud = WordCloud().generate_from_frequencies(frequencies=Counter(words)) plt.figure(figsize=(12,8)) plt.imshow(wordcloud, interpolation='bilinear') plt.axis('off') plt.title(pres) plt.show()",No,5,53.0 "train_data.president = train_data.president.map({'deKlerk':0,'Mandela':1, 'Mbeki':2, 'Motlanthe':3, 'Zuma': 4, 'Ramaphosa':5})",No,5,20.0 "X = train_data.clean_text y = train_data.president X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0, stratify=y)",Yes,4,13.0 "vect = CountVectorizer(ngram_range=(1,2))",No,5,84.0 X_train_ = vect.fit_transform(X_train),No,5,8.0 "log = LogisticRegressionCV(dual=False, penalty='l2', multi_class='multinomial')",No,5,4.0 "log.fit(X_train_, y_train)",No,5,7.0 "print(accuracy_score(y_train, log.predict(X_train_)))",No,5,28.0 "print(accuracy_score(y_test, log.predict(vect.transform(X_test))))",No,5,49.0 test_data = pd.read_csv('../input/test.csv'),No,5,45.0 test_data.head(),No,5,41.0 test_data.text = test_data.text.apply(data_cleaner),No,5,8.0 test_data['president'] = log.predict(vect.transform(test_data.text)),No,5,48.0 "test_data.drop('text', axis=1, inplace=True,)",No,5,10.0 "test_data.to_csv('Thapelo_log.csv',index=False)",No,5,25.0 "import pandas as pd import numpy as np import string import re import seaborn as sns import matplotlib.pyplot as plt sns.set(style='whitegrid', palette='muted', rc={'figure.figsize': (15,10)})",Yes,4,22.0 "print(train.shape, test.shape)",No,5,58.0 "pres = {'deKlerk': 0, 'Mandela': 1, 'Mbeki': 2, 'Motlanthe': 3, 'Zuma': 4, 'Ramaphosa': 5} train.replace({'president': pres}, inplace=True)",No,5,20.0 "# speech number: intro lines starts = { 0: 1, 1: 1, 2: 1, 3: 12, 4: 12, 5: 5, 6: 1, 7: 1, 8: 8, 9: 9, 10: 12, 11: 14, 12: 14, 13: 15, 14: 15, 15: 15, 16: 15, 17: 15, 18: 15, 19: 15, 20: 20, 21: 1, 22: 15, 23: 20, 24: 20, 25: 15, 26: 15, 27: 20, 28: 20, 29: 15, 30: 18 }",No,5,77.0 "def divide_on(df, 
char): # iterate over text column of DataFrame, splitting at each occurrence of char sentences = [] # let's split the data into senteces for i, row in df.iterrows(): # skip the intro lines of the speech for sentence in row['text'].split(char)[starts[i]:]: sentences.append([row['president'], sentence]) df = pd.DataFrame(sentences, columns=['president', 'text']) return df[df['text'] != '']",No,5,78.0 "train = divide_on(train, '.')",No,5,53.0 train.head(5),No,5,41.0 train['president'].value_counts(),No,5,72.0 "# proportion of total train['president'].value_counts()/train.shape[0]",No,5,72.0 "train['sentence'] = None test['president'] = None df = pd.concat([train, test], axis=0, sort=False)",No,5,11.0 "# reorder columns df = df[['sentence', 'text', 'president']]",No,5,10.0 df.tail(),No,5,41.0 "def fixup(text):

# remove punctuation
text = ''.join([char for char in text if char == '-' or char not in string.punctuation])
# remove special characters
text = text.replace(r'^[*-]', '')
# remove numbers
text = ''.join([char for char in text if not char.isdigit()])
# lowercase
text = text.lower()

# remove hanging whitespace
text = "" "".join(text.split())

return text


df[\'text\'] = df[\'text\'].apply(fixup)",No,5,78.0 "# get length of sentence as variable df['length'] = df['text'].apply(len)",No,5,8.0 "# what are our longest sentences? df.sort_values(by='length', ascending=False).head(10)",No,5,41.0 df.loc[3930][1],No,5,14.0 "# what are our shortest sentences? df.sort_values(by='length').head(5)",No,5,41.0 "# let's check the shortest sentences in our test set df[pd.isnull(df['president'])].sort_values(by='length').head()",No,5,41.0 "# sentences with just a few characters are of no use to us df = df[df['length']>10]",No,5,14.0 "# what are our shortest sentences now? df.sort_values(by='length').head(5)",No,5,41.0 df['president'].value_counts(),No,5,72.0 "from sklearn.preprocessing import StandardScaler from sklearn.model_selection import train_test_split from sklearn.model_selection import GridSearchCV from keras.wrappers.scikit_learn import KerasClassifier from keras.models import Sequential from keras.layers import Dense, Activation, Dropout from sklearn.preprocessing import LabelEncoder from sklearn.feature_extraction.text import TfidfVectorizer",No,5,22.0 "tfidf = TfidfVectorizer(strip_accents='unicode', ngram_range=(1,3), stop_words='english', min_df=6) X = tfidf.fit_transform(df['text']).todense() X.shape",Yes,4,8.0 tfidf.get_feature_names(),No,5,53.0 "X = pd.DataFrame(data=X, columns=tfidf.get_feature_names())",No,5,21.0 "df = df.drop(columns=['text', 'length'], axis=1)",No,5,10.0 "X = pd.DataFrame(np.hstack((df, X)))",No,5,12.0 "HYPER_PARAMS = { 'learning_rate': 0.02, 'n_estimators':800, 'max_depth': 6, 'subsample': 0.8, 'colsample_bytree': 0.8, 'max_delta_step': 1, 'objective': 'multi:softmax', 'nthread': 4, 'seed': 1747 } model = xgb.XGBClassifier(**HYPER_PARAMS) model.fit(X,y) ",No,5,7.0 "X.columns = ['sentence_id', 'president_id'] + tfidf.get_feature_names()",No,5,61.0 y_pred=model.predict_proba(df_test),No,5,48.0 "train = X[pd.isnull(X['sentence_id'])] test = X[pd.notnull(X['sentence_id'])]",No,5,14.0 "X_train = train.drop(['sentence_id', 'president_id'], axis=1) X_test = test.drop(['sentence_id', 'president_id'], axis=1)",No,5,10.0 "y_pred= pd.DataFrame(y_pred, index=Id_test,columns = le.classes_)",No,5,12.0 "y_pred.to_csv(""submit.csv"", float_format = \'%.5F\')",No,5,25.0 "def one_hot_encode(label): # initialize zero array vec = [0, 0, 0, 0, 0, 0] # set index of array corresponding to label = 1 vec[label] = 1 return vec # save encoded labels as target for model y_train = np.vstack(row for row in train['president_id'].apply(one_hot_encode).values)",No,5,20.0 y_train[600],No,5,41.0 "print('Train size:', X_train.shape) print('Test size:', X_test.shape)",No,5,58.0 "# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the ""../input/"" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
import os
print(os.listdir(""../input""))

# Any results you write to the current directory are saved as output.",No,5,88.0 "# importing the modulus from keras.preprocessing.image import ImageDataGenerator,img_to_array,load_img from keras.models import Sequential from keras.layers import Dropout, Flatten, Dense , Activation from keras import applications import re",No,5,22.0 "def create_model(lyrs=[X_train.shape[1], 1028, 512, 256], act='relu', opt='Adam', dr=0.25): model = Sequential() # create first hidden layer model.add(Dense(lyrs[0], input_dim=X_train.shape[1], activation=act)) # create additional hidden layers for i in range(1,len(lyrs)): model.add(Dense(lyrs[i], activation=act)) # add dropout, default is none model.add(Dropout(dr)) # create output layer model.add(Dense(6, activation='softmax')) # output layer model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy']) return model",No,5,4.0 "model = create_model() print(model.summary())",No,5,84.0 "# train model on full train set, with 80/20 CV split
training = model.fit(X_train, y_train, epochs=5, batch_size=64, validation_split=0.2)
val_acc = np.mean(training.history['val_acc'])
print(""\
%s: %.2f%%"" % (\'val_acc\', val_acc*100))",Yes,4,7.0 "# summarize history for accuracy plt.plot(training.history['acc']) plt.plot(training.history['val_acc']) plt.title('model accuracy') plt.ylabel('accuracy') plt.xlabel('epoch') plt.legend(['train', 'validation'], loc='upper left') plt.show()",No,5,35.0 predictions = model.predict(X_test),No,5,48.0 "pred_lbls = [] for pred in predictions: pred = list(pred) max_value = max(pred) max_index = pred.index(max_value) pred_lbls.append(max_index) predictions = np.array(pred_lbls)",No,5,55.0 predictions.shape,No,5,58.0 test['president_id'] = predictions,No,5,8.0 test['president_id'].value_counts(),No,5,72.0 "submission = test[['sentence_id','president_id']] submission.columns = ['sentence', 'president'] submission.to_csv('rnn_1.csv', index=False)",No,5,25.0 submission.president.value_counts(),No,5,72.0 "#Dataframes etc import pandas as pd import numpy as np #Visualization import matplotlib.pyplot as plt import seaborn as sns %matplotlib inline from matplotlib.colors import ListedColormap from pylab import rcParams rcParams['figure.figsize'] = 10, 8 sns.set_style('whitegrid') #Machine learning: from sklearn import preprocessing ## ML Cross validation and metrics from sklearn.model_selection import cross_val_score from sklearn.model_selection import train_test_split from sklearn import metrics ## ML models from sklearn.linear_model import LogisticRegression #Natural language processing import nltk #nltk.download('punkt') from nltk.tokenize import word_tokenize, TreebankWordTokenizer ## to automate the NLP extraction from sklearn.feature_extraction.text import CountVectorizer ",No,5,23.0 "df=pd.read_csv('../input/train.csv') df.head() ",Yes,4,45.0 df['president'].unique(),No,5,57.0 "df.info() #no nulls, nothing unexpected",No,5,40.0 " dict = {'deKlerk': 0, 'Mandela': 1, 'Mbeki': 2, 'Motlanthe': 3, 'Zuma': 4, 'Ramaphosa': 5} ",No,5,77.0 "df['presi_num']= df['president'] df['presi_num']=df['presi_num'].replace(dict) df.head()",Yes,4,8.0 "df['speech length']=df['text'].str.len() df.head()",Yes,4,8.0 "df_summary = pd.DataFrame(df.groupby('president')['speech length'].mean()) df_summary=df_summary.reset_index() df_summary = df_summary.sort_values(by='speech length') sns.barplot(data = df_summary, x='president', y='speech length')",No,5,33.0 "df_summary['speech_length_%']=df_summary['speech length'].div(df_summary['speech length'].sum(), axis=0).multiply(100) df_summary.head()",Yes,4,8.0 list(df_summary['speech_length_%']),No,5,41.0 "df_summary['presi_num']= df_summary['president'] df_summary['presi_num']= df_summary['presi_num'].replace(dict) df_summary.sort_values(by='presi_num', ascending = True) df_summary",Yes,4,8.0 "#check list(df_summary['presi_num'])",No,5,41.0 "import string def remove_punctuation_numbers(post): punc_numbers = string.punctuation + '0123456789' return ''.join([l for l in post if l not in punc_numbers]) ",No,5,78.0 # from sklearn.model_selection import GridSearchCV\n# from xgboost import XGBRegressor\n\n# y_Train = X_Train.ConfirmedCases\n# hyperParam = {\,No,5,76.0 \,No,5,76.0 #print (type(Y_train_CC))\n#X_train_CC.info(),No,5,76.0 "#print (X_train_CC.shape, X_train_Fat.shape, X_test.shape)",No,5,76.0 "#TODO: check duplicates,missing numeric, string, typo.",No,5,76.0 "#cols_with_missing = [col for col in X_train.columns \n# if X_train[col].isnull().any()]\n#X_train = X_train.drop(cols_with_missing, axis=1)\n#X_test = X_test.drop(cols_with_missing, axis=1)",No,5,76.0 #NO NULL,No,5,76.0 #gbr = 
GradientBoostingRegressor(random_state=17),No,5,76.0 # \n\n# df_train.isna().sum()\n# df_test.isna().sum()\n\n# ,No,5,76.0 # \n\n# pd.DataFrame(df_train).dtypes,No,5,76.0 "#dftrain.groupby('City')['revenue'].agg(['count','mean'])",No,5,76.0 "#P1 int testset has no 7,8,10,11,13,14,16 values skew 1.8\n#P2 FLOAT skew 0.03 test set only few decima values between 1 and 2, 4 and 5, \n#P3 float skew 0.14 test set only some decimal values between 4 and 5\n#P10 int64 skew 1.70 test set 80% values are 5, 15% values are 10, test set exactly like train set.\n#P22 int64 skew 0.79 test set 25% values are 1, 20% 2, 15% 3 , 12% 4, 9% 5 gradually decreasing distribution\n#P23 int64 skew 3.24 test set 40% values are 1, 15% 2, 12% 3, 9% 4, 14% 5, 3-4% each 10 15 20 25\n#P10 train and test exactly thesame",No,5,76.0 "#no log transform = P2, P3, P7,",No,5,76.0 #for col in dftrain.columns:\n # if (dftrain[col].dtype == int) | (dftrain[col].dtype == float):\n # print (col)\n # print (skew(dftrain[col]))\n # print (skew(np.log1p(dftrain[col]))),No,5,76.0 # import pandas_profiling as pdp\n# pdp.ProfileReport(train),No,5,76.0 "# city_rev = []\n\n# for i in train['City']:\n# for key, value in mean_dict.items():\n# if i == key:\n# city_rev.append(value)\n \n# df_city_rev = pd.DataFrame({'city_rev':city_rev})\n# train = pd.concat([train,df_city_rev],axis=1)\n# train.head()",No,5,76.0 "# train.replace({""City"":mean_dict}, inplace=True)\n# test.replace({""City"":mean_dict}, inplace=True)\n# test[\",No,5,76.0 # train.iloc[list(tukey_outliers(df_num.acceleration).index)],No,5,76.0 "# \n# for i in range(len(num_list)):\n# # \n# upper_lim = full_data[num_list[i]].quantile(.95)\n# lower_lim = full_data[num_list[i]].quantile(.05)\n \n# # IQR\n# Q1 = full_data[num_list[i]].quantile(.25)\n# Q3 = full_data[num_list[i]].quantile(.75)\n# IQR = Q3 - Q1\n# outlier_step = 1.5 * IQR\n \n# # 1.5IQR95%tile5%tile\n# full_data.loc[(full_data[num_list[i]] > (Q3 + outlier_step)), num_list[i]] =upper_lim\n# full_data.loc[(full_data[num_list[i]] < (Q1 - outlier_step)), num_list[i]] = lower_lim",No,5,76.0 "# columns = len(num_list)/4+1\n\n# # boxplot\n# fig = plt.figure(figsize=(15,20))\n# plt.subplots_adjust(hspace=0.2, wspace=0.8)\n# for i in range(len(num_list)):\n# ax = fig.add_subplot(columns, 4, i+1)\n# sns.boxplot(y=full_data[num_list[i]], data=full_data, ax=ax)\n# plt.show()",No,5,76.0 "# skew_col = skewed_data[skewed_data > 10].index\n\n# # \n# fig = plt.figure(figsize=(10, 8))\n# for i in range(len(skew_col)):\n# ax = fig.add_subplot(2, 3, i+1)\n# try:\n# sns.distplot(combined_df[skew_col[i]], fit=norm, ax=ax)\n# except:\n# # kdekde=False\n# sns.distplot(combined_df[skew_col[i]], fit=norm, kde=False, ax=ax)\n# plt.show()\n\n# # \n# for i in range(len(skew_col)):\n# combined_df[skew_col[i]] = np.log1p(combined_df[skew_col[i]])\n \n# # \n# # \n# fig = plt.figure(figsize=(10, 8))\n# for i in range(len(skew_col)):\n# ax = fig.add_subplot(2, 3, i+1)\n# try:\n# sns.distplot(combined_df[skew_col[i]], fit=norm, ax=ax)\n# except:\n# # kdekde=False\n# sns.distplot(combined_df[skew_col[i]], fit=norm, kde=False, ax=ax)\n# plt.show()",No,5,76.0 "# #LightGBM\n# import lightgbm as lgb\n# #\n# import optuna\n\n# lgb_train = lgb.Dataset(X_train, y_train)\n# lgb_eval = lgb.Dataset(X_test, y_test)",No,5,76.0 "# def objective(trial):\n# params = {'metric': {'rmse'},\n# 'max_depth' : trial.suggest_int('max_depth', 1, 10),\n# 'subsumple' : trial.suggest_uniform('subsumple', 0.0, 1.0),\n# 'subsample_freq' : trial.suggest_int('subsample_freq', 0, 1),\n# 
'leaning_rate' : trial.suggest_loguniform('leaning_rate', 1e-5, 1),\n# 'feature_fraction' : trial.suggest_uniform('feature_fraction', 0.0, 1.0),\n# 'lambda_l1' : trial.suggest_uniform('lambda_l1' , 0.0, 1.0),\n# 'lambda_l2' : trial.suggest_uniform('lambda_l2' , 0.0, 1.0)}\n \n# gbm = lgb.train(params,\n# lgb_train,\n# valid_sets=(lgb_train, lgb_eval),\n# num_boost_round=10000,\n# early_stopping_rounds=100,\n# verbose_eval=50)\n# predicted = gbm.predict(X_test)\n# RMSE = np.sqrt(mean_squared_error(y_test, predicted))\n \n# pruning_callback = optuna.integration.LightGBMPruningCallback(trial, 'rmse')\n# return RMSE",No,5,76.0 "# study = optuna.create_study()\n# study.optimize(objective, timeout=360)",No,5,76.0 # print(\,No,5,76.0 "\n# #Optuna\n# params = {""metric"": {\",No,5,76.0 "# #\n# lgb.plot_importance(gbm, height=0.5, figsize=(8,16))",No,5,76.0 # \n# prediction_log = gbm.predict(test_X)\n# print(prediction_log)\n# prediction =np.exp(prediction_log) \n# print(prediction),No,5,76.0 "# cap revenue at 10,000,000 for outliers\n# df.loc[df['revenue'] > 10000000, 'revenue'] = 10000000",No,5,76.0 "# train_profile = ProfileReport(train, title='Pandas Profiling Report', html={'style':{'full_width':True}})\n# train_profile",No,5,76.0 "# test_profile = ProfileReport(test, title='Pandas Profiling Report', html={'style':{'full_width':True}})\n# test_profile",No,5,76.0 # date_encoded = {}\n# for s in train[\,No,5,76.0 # train['date_encoded'] = train['Date'].apply(lambda x: date_encoded[x])\n# train['date_encoded'] = (train['date_encoded'] - train['date_encoded'].mean()) / train['date_encoded'].std()\n# train.head(),No,5,76.0 #INVERSE TRANSFORM\n#pred_f = pred_f*scale,No,5,76.0 "#result.set_index('ForecastId', inplace=True)",No,5,76.0 "#result= result[['ConfirmedCases','Fatalities']].round(0)",No,5,76.0 " #submission['ConfirmedCases'] = [0 if submission.loc[i, 'ConfirmedCases'] <= -0 \n # else submission.loc[i, 'ConfirmedCases'] for i in submission.index]",No,5,76.0 "'''train = df_train.values\nX_train, y_train = train[:,:-2], train[:,-2:]'''",No,5,76.0 "'''model1 = XGBRegressor(\n learning_rate =0.1,\n n_estimators=1000,\n max_depth=5,\n min_child_weight=1,\n gamma=0,\n subsample=0.8,\n colsample_bytree=0.8,\n objective= 'reg:squarederror',\n scale_pos_weight=1)\nmodelfit(model1, X_train, y_train[:,0])'''",No,5,76.0 "'''model2 = XGBRegressor(\n learning_rate =0.1,\n n_estimators=1000,\n max_depth=5,\n min_child_weight=1,\n gamma=0,\n subsample=0.8,\n colsample_bytree=0.8,\n objective= 'reg:squarederror',\n scale_pos_weight=1)\nmodelfit(model2, X_train, y_train[:,1])'''",No,5,76.0 "'''df_submit.ConfirmedCases = df_submit.ConfirmedCases.apply(lambda x:max(0,round(x,0)))\ndf_submit.Fatalities = df_submit.Fatalities.apply(lambda x:max(0,round(x,0)))'''",No,5,76.0 #?TabularPandas,No,5,76.0 "# estimators = [('rf',RF_model ), ('ada', adaboost_model_for_ConfirmedCases)]\n# stacking_model_for_ConfirmedCases = StackingClassifier(estimators=estimators, n_jobs=4)\n# stacking_model_for_ConfirmedCases.fit(train_numeric_X, train_numeric_Y[numeric_features_Y[0]])",No,5,76.0 "# stacking_model_for_Fatalities = StackingClassifier(estimators=estimators, n_jobs=4)\n# stacking_model_for_Fatalities.fit(train_numeric_X, train_numeric_Y[numeric_features_Y[1]])",No,5,76.0 "# predicted = stacking_model_for_ConfirmedCases.predict(test_numeric_X)\n# predicted2 = stacking_model_for_Fatalities.predict(test_numeric_X)\n\n# submission = np.vstack((test['ForecastId'], predicted,predicted2)).T\n# submission = 
submission.astype(np.int32)\n\n# df = pd.DataFrame(data=submission, columns=['ForecastId','ConfirmedCases','Fatalities'])\n# df.to_csv('stacking_submission.csv', index=False)\n# df.to_csv('submission.csv', index=False)",No,5,76.0 # from sklearn.neighbors import KNeighborsClassifier\n# from sklearn.naive_bayes import GaussianNB \n# from sklearn.linear_model import LogisticRegression\n# from sklearn import model_selection\n# from mlxtend.classifier import StackingCVClassifier,No,5,76.0 # clf1 = KNeighborsClassifier(n_neighbors=100)\n# clf2 = RandomForestClassifier(n_estimators=5)\n# clf3 = GaussianNB()\n# # Logit will be used for stacking\n# lr = LogisticRegression(solver=\,No,5,76.0 "# Tried adding spaCy coref as a feature. I have no expertise with this library.\n# It seems to work in many cases, but for some cases the coref resolves to just \n# he/she/etc rather than a noun. Not sure if it is because the coref model is \n# not confident, I'm navigating the object model incorrectly, or just a limitation\n# of the model. But I do see some gain.\n\n# forked from: https://www.kaggle.com/shujian/ml-model-example-with-train-test\n# loading spaCy coref extension like: https://www.kaggle.com/ryches/applying-spacy-coreference-but-nothing-goes-right",No,5,76.0 #os.system(f'ls {mf}1'),No,5,76.0 "#fig, (axis1) = plt.subplots(1,1,figsize=(8,3))\n#sns.countplot(x = 'Open', hue = 'DayOfWeek', data = data_train,)",No,5,76.0 "#sns.factorplot(x =""Year"", y =""Sales"", hue =""Promo"", data = data_train, size = 3, kind =""box"", palette =""muted"")",No,5,76.0 "#sns.factorplot(x =""Year"", y =""Sales"", hue =""SchoolHoliday"", data = data_train, size = 3, kind =""box"", palette =""muted"")",No,5,76.0 "#sns.factorplot(x =""Year"", y =""Sales"", hue =""HolidayBin"", data = data_train, size = 4, kind =""bar"", palette =""muted"")",No,5,76.0 "#fig, (axis1, axis2, axis3) = plt.subplots(1, 3, figsize=(12,3))\n#sns.barplot(average_store_type.index, average_store_type['Sales'], ax=axis1)\n#sns.barplot(average_store_type.index, average_store_type['Customers'], ax=axis2)\n#sns.barplot(average_store_type.index, average_store_type['CompetitionDistance'], ax=axis3)",No,5,76.0 "#fig, (axis1,axis2) = plt.subplots(1,2,figsize=(12,3))\n#sns.barplot(average_assort.index, average_assort['Sales'], ax=axis1)\n#ns.barplot(average_assort.index, average_assort['Customers'], ax=axis2)",No,5,76.0 "# ( )\n# res_list_depth = []\n# res_list_nestim = []\n# for i in range (1, 1000, 50):\n# store_part = train_stores[i]\n# X_train_part = store_part.drop([""Sales"", ""Store"", ""Customers""],axis=1)\n# Y_train_part = store_part[""Sales""]\n# X_train_part = X_train_part.fillna(X_train_part.mean())\n# estimator = RandomForestRegressor(random_state=42, criterion = \",No,5,76.0 "# : 0.14930 - pub, 0.13491 - priv",No,5,76.0 #df_store[pd.isnull(df_store.Promo2SinceWeek)]\n#df_store[pd.isnull(df_store.Promo2SinceWeek)& (df_store.Promo2==0)],No,5,76.0 "#!kaggle competitions submit -c rossmann-store-sales -f rossmann_submission.csv -m ""rossman with extra features""",No,5,76.0 #train_sales = np.log(train_sales),No,5,76.0 #preds = np.exp(predictions),No,5,76.0 "\n#df_train_store[['StateHoliday', 'StoreType', 'Assortment']] = df_train_store[['StateHoliday', 'StoreType', 'Assortment']].apply(lambda x: x.cat.codes)",No,5,76.0 "#This is the best combination i got from what i propose to try out with a (mse) score of 0.855 which is quite good\n#grid.best_params_,grid.best_score_\n#MY BEST PARAMS ARE 
:n_estimators=128,max_depth=20,min_samples_split=10",No,5,76.0 # now using Xgb ,No,5,76.0 "#X_train, X_valid, y_train, y_valid = train_test_split(x_train, y, train_size=0.8, test_size=0.2,\n # random_state=0)",No,5,76.0 # store_rows[store_rows['Sales']==0],No,5,76.0 # store.isna.sum(),No,5,76.0 # Decision tress - label encoding should be used.\n# regression - one hot encoding must be used.,No,5,76.0 # submitting the train on test data set,No,5,76.0 "\n# parameters={'max_depth':list(range(5,20))}\n# base_model=DTR()\n# cv_model=GridSearchCV(base_model,param_grid=parameters,cv=5,return_train_score=True).fit(X_train,y_train)\n",No,5,76.0 # cv_model.best_params_,No,5,76.0 "# cv_results_1=pd.DataFrame(cv_model.cv_results_).sort_values(by='mean_test_score',ascending=False)\n# cv_results=pd.DataFrame(cv_model.cv_results_).sort_values(by='mean_test_score',ascending=False)\n# cv_results_1.set_index('param_max_depth')['mean_test_score'].plot.line()\n# cv_results_1.set_index('param_max_depth')['mean_train_score'].plot.line()\n# plt.legend(['test','train'])",No,5,76.0 #!pip install pydotplus,No,5,76.0 "# def draw_tree(model, columns):\n# import pydotplus\n# from sklearn.externals.six import StringIO\n# from IPython.display import Image\n# import os\n# from sklearn import tree\n \n# graphviz_path = \",No,5,76.0 "# draw_tree(model_dtr,data_merged.columns.drop(['Sales','Date']))",No,5,76.0 #df['Date'].dt.strftime('%a'),No,5,76.0 "# from sklearn.model_selection import GridSearchCV\n\n# parameters={'max_depth':list(range(5,20))} # parmeters{'max_depth':list(range(5,20),'min_sample_split':[5,10,20])}\n# base_model=DecisionTreeRegressor()\n# cv_model=GridSearchCV(base_model,param_grid=parameters,cv=5,return_train_score=True).fit(train_x,train_y)",No,5,76.0 "# df_cv_results=pd.DataFrame(cv_model.cv_results_).sort_values(by='mean_test_score',ascending=False)[['param_max_depth','mean_test_score','mean_train_score']]\n# plt.figure(figsize=(10,5))\n# df_cv_results.set_index('param_max_depth')['mean_test_score'].plot.line()\n# df_cv_results.set_index('param_max_depth')['mean_train_score'].plot.line()\n# print(df_cv_results)\n",No,5,76.0 "# learn.fit_one_cycle(5, 5e-4, wd=0.1)\n# learn.recorder.plot_losses()",No,5,76.0 # #colab\n# # google-drive-ocamlfuse\n# # https://github.com/astrada/google-drive-ocamlfuse\n# !apt-get install -y -qq software-properties-common python-software-properties module-init-tools\n# !add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null\n# !apt-get update -qq 2>&1 > /dev/null\n# !apt-get -y install -qq google-drive-ocamlfuse fuse\n\n# # ColabAuth token\n# from google.colab import auth\n# auth.authenticate_user()\n\n# # Drive FUSE librarycredential\n# from oauth2client.client import GoogleCredentials\n# creds = GoogleCredentials.get_application_default()\n# import getpass\n# !google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL\n# vcode = getpass.getpass()\n# !echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}\n\n# !mkdir -p drive\n# !google-drive-ocamlfuse -o nonempty drive,No,5,76.0 # !pip install kaggle\n# !echo \,No,5,76.0 # !ls\n# !unzip store.csv.zip\n# !unzip train.csv.zip\n# !unzip test.csv.zip,No,5,76.0 "# !pip install kaggle\n# !kaggle competitions submit -c rossmann-store-sales -f submission.csv -m ""submision""",No,5,76.0 # from google.colab import files\n# files.download('out.csv') \n# files.download('weights_rossmann.best.hdf5') \n,No,5,76.0 # !rsync -avz 
--progress ./model/model_both_a_13.pkl ../drive/Job/,No,5,76.0 #joined.to_pickle(PATH/'joined')\n#joined_test.to_pickle(PATH/'joined_test'),No,5,76.0 #df.to_pickle(PATH/'df'),No,5,76.0 #joined = pd.read_pickle(PATH/'joined')\n#joined_test = pd.read_pickle(PATH/f'joined_test'),No,5,76.0 #joined.to_pickle(path/'train_clean')\n#joined_test.to_pickle(path/'test_clean'),No,5,76.0 "result = pd.concat([test_dataset.ForecastId,output_confirmed_cases_df, output_fatalities_df], axis=1)\n# result.index.names = ['indexes']\n# result.columns.name = result.index.name\n# result.index.name = None\nresult\n",No,4,76.0 "# #Normalizing\n\n# no = 1\n\n# X[""PRI_jet_all_pt""]=((X[""PRI_jet_all_pt""]-X[""PRI_jet_all_pt""].min())/(X[""PRI_jet_all_pt""].max()-X[""PRI_jet_all_pt""].min()))*no\n# X_test[""PRI_jet_all_pt""]=((X_test[""PRI_jet_all_pt""]-X_test[""PRI_jet_all_pt""].min())/(X_test[""PRI_jet_all_pt""].max()-X_test[""DER_mass_MMC""].min()))*no\n\n# X[""PRI_jet_subleading_pt""]=((X[""PRI_jet_subleading_pt""]-X[""PRI_jet_subleading_pt""].min())/(X[""PRI_jet_subleading_pt""].max()-X[""PRI_jet_subleading_pt""].min()))*no\n# X_test[""PRI_jet_subleading_pt""]=((X_test[""PRI_jet_subleading_pt""]-X_test[""PRI_jet_subleading_pt""].min())/(X_test[""PRI_jet_subleading_pt""].max()-X_test[""PRI_jet_subleading_pt""].min()))*no\n\n# X[""PRI_jet_leading_pt""]=((X[""PRI_jet_leading_pt""]-X[""PRI_jet_leading_pt""].min())/(X[""PRI_jet_leading_pt""].max()-X[""PRI_jet_leading_pt""].min()))*no\n# X_test[""PRI_jet_leading_pt""]=((X_test[""PRI_jet_leading_pt""]-X_test[""PRI_jet_leading_pt""].min())/(X_test[""PRI_jet_leading_pt""].max()-X_test[""PRI_jet_leading_pt""].min()))*no\n\n# X[""PRI_met_sumet""]=((X[""PRI_met_sumet""]-X[""PRI_met_sumet""].min())/(X[""PRI_met_sumet""].max()-X[""PRI_met_sumet""].min()))*no\n# X_test[""PRI_met_sumet""]=((X_test[""PRI_met_sumet""]-X_test[""PRI_met_sumet""].min())/(X_test[""PRI_met_sumet""].max()-X_test[""PRI_met_sumet""].min()))*no\n\n# X[""DER_sum_pt""]=((X[""DER_sum_pt""]-X[""DER_sum_pt""].min())/(X[""DER_sum_pt""].max()-X[""DER_sum_pt""].min()))*no\n# X_test[""DER_sum_pt""]=((X_test[""DER_sum_pt""]-X_test[""DER_sum_pt""].min())/(X_test[""DER_sum_pt""].max()-X_test[""DER_sum_pt""].min()))*no\n\n# X[""DER_mass_jet_jet""]=((X[""DER_mass_jet_jet""]-X[""DER_mass_jet_jet""].min())/(X[""DER_mass_jet_jet""].max()-X[""DER_mass_jet_jet""].min()))*no\n# X_test[""DER_mass_jet_jet""]=((X_test[""DER_mass_jet_jet""]-X_test[""DER_mass_jet_jet""].min())/(X_test[""DER_mass_jet_jet""].max()-X_test[""DER_mass_jet_jet""].min()))*no\n\n# X[""DER_pt_h""]=((X[""DER_pt_h""]-X[""DER_pt_h""].min())/(X[""DER_pt_h""].max()-X[""DER_pt_h""].min()))*no\n# X_test[""DER_pt_h""]=((X_test[""DER_pt_h""]-X_test[""DER_pt_h""].min())/(X_test[""DER_pt_h""].max()-X_test[""DER_pt_h""].min()))*no\n\n# X[""DER_mass_vis""]=((X[""DER_mass_vis""]-X[""DER_mass_vis""].min())/(X[""DER_mass_vis""].max()-X[""DER_mass_vis""].min()))*no\n# X_test[""DER_mass_vis""]=((X_test[""DER_mass_vis""]-X_test[""DER_mass_vis""].min())/(X_test[""DER_mass_vis""].max()-X_test[""DER_mass_vis""].min()))*no\n\n# X[""DER_mass_transverse_met_lep""]=((X[""DER_mass_transverse_met_lep""]-X[""DER_mass_transverse_met_lep""].min())/(X[""DER_mass_transverse_met_lep""].max()-X[""DER_mass_transverse_met_lep""].min()))*no\n# X_test[""DER_mass_transverse_met_lep""]=((X_test[""DER_mass_transverse_met_lep""]-X_test[""DER_mass_transverse_met_lep""].min())/(X_test[""DER_mass_transverse_met_lep""].max()-X_test[""DER_mass_transverse_met_lep""].min()))*no\n\n# 
X[""DER_mass_MMC""]=((X[""DER_mass_MMC""]-X[""DER_mass_MMC""].min())/(X[""DER_mass_MMC""].max()-X[""DER_mass_MMC""].min()))*no\n# X_test[""DER_mass_MMC""]=((X_test[""DER_mass_MMC""]-X_test[""DER_mass_MMC""].min())/(X_test[""DER_mass_MMC""].max()-X_test[""DER_mass_MMC""].min()))*no\n\n\n# X.head()",No,5,76.0 # # normalize the data attributes\n# X = X.apply(lambda x: (x - np.mean(x)) / (np.max(x) - np.min(x)))\n\n# X_test = X_test.apply(lambda x: (x - np.mean(x)) / (np.max(x) - np.min(x)))\n\n\n# X.head(),No,5,76.0 "#X = X.replace(-999.000,np.nan)\n#X.head()",No,5,76.0 "#X_test = X_test.replace(-999.000,np.nan)",No,5,76.0 #X_test.head(),No,5,76.0 "#X = X.replace(-999.000,0)\n#X_test = X_test.replace(-999.000,0)\n#X.head()",No,5,76.0 "#X.fillna(X.median(), inplace=True)\n#X_test.fillna(X_test.median(), inplace=True)\n\n#X.head()",No,5,76.0 #X.tail(1000),No,5,76.0 "# train_set = train_set.drop(['Soil_Type'+str(idx) for idx in range(1, 41)], axis=1)\n# train_set = train_set.drop(['Wilderness_Area'+str(idx) for idx in range(1, 5)], axis=1)",No,5,76.0 "#Final submission score is 0.67,which needs to be improved.",Yes,5,76.0 "#Soil_Type7,Soil_Type15 has 0 standard deviation\n#train = train.drop([""Soil_Type7"",""Soil_Type15""],axis = 1)\n#test = test.drop([""Soil_Type7"",""Soil_Type15""],axis = 1)",No,5,76.0 "#of the 3 algorithms applied to the dataset,ensemble model works better with a score of 0.84\n",No,5,76.0 #Getting feature importance after running the data through a ensemble model classifier,No,5,76.0 "#Both the models have same important features,also these important features completely ignores the soil type feature.\n#This should be included,except Soil_Type7,Soil_Type15 has lower standard deviation.\n\n",No,5,76.0 #Naive Bayes Model,No,5,76.0 "#SGDClassifier,very low performance",No,5,76.0 #df_test1.head(),No,5,76.0 #y_pred = classifier.predict(df_test1),No,5,76.0 "#solution = pd.DataFrame({'Id':df_Test1.Id, 'Cover_Type':y_pred}, columns = ['Id','Cover_Type'])\n#solution.to_csv('SVMcover_sol.csv', index=False)",Yes,5,76.0 "# gradientBoostingModel = GradientBoostingClassifier(loss = 'deviance',\n# learning_rate = 0.01,\n# n_estimators = 100,\n# max_depth = 30,\n# random_state=10)\n\n# gradientBoostingModel.fit(X_train,y_train)",No,5,76.0 "# SGDClassifier = SGDClassifier(loss = 'hinge', \n# penalty = 'l1',\n# learning_rate = 'optimal',\n# random_state = 10, \n# max_iter=100)\n\n# SGDClassifier.fit(X_train,y_train)",No,5,76.0 "# SVClassifier = SVC(kernel= 'linear',\n# degree=3,\n# max_iter=10000,\n# C=2, \n# random_state = 55)\n\n# SVClassifier.fit(X_train,y_train)",No,5,76.0 # Forest Cover Prediction,Yes,5,76.0 #estimator.get_params().keys(),No,5,76.0 "##parameters_grid = {\n## 'model_fitting__n_estimators' : [70, 100, 130],\n## 'model_fitting__max_features' : [3, 4, 5, 6],\n##}\n##\n##grid_cv = grid_search.GridSearchCV(estimator, parameters_grid, scoring = 'neg_mean_absolute_error', cv = 3)\n##grid_cv.fit(train_data, train_labels)\n##\n##print(-grid_cv.best_score_)\n##print(grid_cv.best_params_)",No,5,76.0 #union_data['hour_type']=0\n#union_data['hour_type'][(union_data['hour']<=1)]='1'\n#union_data['hour_type'][(union_data['hour']>=2)& (union_data['hour']<=4)]='2'\n#union_data['hour_type'][(union_data['hour']>=4)& (union_data['hour']<=6)]='3'\n#union_data['hour_type'][(union_data['hour']>=6)& (union_data['hour']<=8)]='4'\n#union_data['hour_type'][(union_data['hour']>=9)& (union_data['hour']<=15)]='5'\n#union_data['hour_type'][(union_data['hour']>=16)& 
(union_data['hour']<=18)]='6'\n#union_data['hour_type'][(union_data['hour']>=19)& (union_data['hour']<=20)]='7'\n#union_data['hour_type'][(union_data['hour']>=21)]='8'\n\n,No,5,76.0 "# It is undeniable that working days have a distinct behavior on their own. So we assume that the separation non working days vs working days is a valid one. However,\n# Looking at the non working days the conclusion is less obvious. We will assume that for seasons 3 and 4 we have a distinct pattern for non working days and leave a \n# possible refinement to a later version. Lets see if we can find a separation for weekends and public holidays for seasons 1 and 2.\n\n# In what follows we will repeat the steps above but now, instead of X_train we will have X_holiday_1, instead of X_workingday_1 we will have X_1_public_holidays \n# and X_1_weekends instead X_holiday_1. And the same for season 2. We try to figure out if this separation will increase the ration mean counts /std counts at least\n# for one of these groups",No,5,76.0 "# Conclusion: We can observe that, for season 1, by separating the public holidays from the weekends we observe a better ratio for the public holidays, while for\n# weekends this ratio is higher for only 50% of the hours, while for season 2, the conclusion is the opposite.",No,5,76.0 # from sklearn.tree import DecisionTreeClassifier\n# from sklearn.tree import DecisionTreeRegressor\n\n# # model = DecisionTreeClassifier()\n\n# # random_state .\n# #model = DecisionTreeClassifier(random_state=37)\n# model = DecisionTreeRegressor(random_state=37)\n# model,No,5,76.0 "'''\nfor idx_train, idx_test in ms.split(df_train_data):\n csv = linear_model.Ridge().fit(df_train_data.iloc[idx_train], \\\n df_train_target.iloc[idx_train])\n print('train score: {0: .3f}, test score: {1: .3f}'.format(\n csv.score(df_train_data.iloc[idx_train], df_train_target.iloc[idx_train]),\n csv.score(df_train_data.iloc[idx_test], df_train_target.iloc[idx_test])\n ))\n'''",No,5,76.0 "'''\nfor idx_train, idx_test in ms.split(df_train_data):\n csv = svm.SVR(kernel='rbf', C=10, gamma=0.001).fit(df_train_data.iloc[idx_train],\\\n df_train_target.iloc[idx_train])\n print('train score: {0: .3f}, test score: {1: .3f}'.format(\n csv.score(df_train_data.iloc[idx_train], df_train_target.iloc[idx_train]),\n csv.score(df_train_data.iloc[idx_test], df_train_target.iloc[idx_test])\n ))\n'''",No,5,76.0 "#df_train_data_notime = df_train_data.drop(['hour', 'dayofweek', 'month'], axis=1)",No,5,76.0 "# for idx_train,idx_test in ms.split(df_train_data):\n# csv = RandomForestRegressor(n_estimators=500).fit(df_train_data.iloc[idx_train],\\\n# df_train_target.iloc[idx_train])\n# print('train score: {0: .3f}, test score: {1: .3f}'.format(\n# csv.score(df_train_data.iloc[idx_train], df_train_target.iloc[idx_train]),\n# csv.score(df_train_data.iloc[idx_test], df_train_target.iloc[idx_test])\n# ))",No,5,76.0 "'''\nfor idx_train,idx_test in ms.split(df_train_data):\n csv = RandomForestRegressor(n_estimators=100).fit(df_train_data_notime.iloc[idx_train],\\\n df_train_target.iloc[idx_train])\n print('train score: {0: .3f}, test score: {1: .3f}'.format(\n csv.score(df_train_data_notime.iloc[idx_train], df_train_target.iloc[idx_train]),\n csv.score(df_train_data_notime.iloc[idx_test], df_train_target.iloc[idx_test])\n ))\n'''",No,5,76.0 "# estimator2 = RandomForestRegressor(n_estimators=200, max_features=0.6, max_depth=15)\n# plot_learning_curve(estimator2, title, \n# df_train_data, df_train_target, ylim=(0.7, 1.01), cv=cv, n_jobs=4)\n# 
plt.show()",No,5,76.0 #df_sample['count'] = df_sample['count'].apply(lambda x: int(x + 0.5)),No,5,76.0 ####,No,5,76.0 "# sns.factorplot(x=""month"",y=""count"",data=train_set,kind=\",No,5,76.0 # train_set['high_time'] = np.zeros_like(train_set['time'])\n# train_set['high_time'].loc[(((train_set['time'] > 6) & (train_set['time'] < 15)) | (train_set['time'] == 20))] = 1\n# train_set['high_time'].loc[((train_set['time'] == 8) | (train_set['time'] == 16) | (train_set['time'] == 19))] = 2\n# train_set['high_time'].loc[((train_set['time'] == 17) | (train_set['time'] == 18))] = 3,No,5,76.0 "# def RMSLE(y_hat, data):\n# y_true = data.get_label()\n# y_hat = np.round(y_hat)\n# y_hat[y_hat<0]=0\n# return 'rmlse', np.sqrt(mean_squared_log_error(y_true, y_hat)), True",No,5,76.0 "# d_train = lgb.Dataset(X, label=y)\n# params = {'objective': 'regression', 'metric': 'rmsle', 'random_state': 501, 'verbose': 0, 'reg_alpha ': 0.1, 'reg_lambda': 0.1}",No,5,76.0 "# lgb_cv = lgb.cv(\n# params, \n# d_train,\n# metrics = 'rmsle',\n# feval= RMSLE,\n# nfold=5,\n# verbose_eval = 5)",No,5,76.0 "# lgb_model = lgb.train(\n# params, \n# d_train,\n# feval= RMSLE,\n# verbose_eval = 5)",No,5,76.0 "# d_importance = pd.DataFrame(columns=['features'], data=X.columns)\n# d_importance['gain_importance'] = lgb_model.feature_importance(importance_type='gain')\n# d_importance['split_importance'] = lgb_model.feature_importance(importance_type='split')\n# d_importance.sort_values(by='gain_importance',ascending=False).head(25)",No,5,76.0 "# xgb_model = XGBRegressor(colsample_bytree=0.7, learning_rate=0.05, max_depth=7, min_child_weight=4, subsample=0.7, random_state=42)\n# xgb_model.fit(X, y)",No,5,76.0 "# def rmsle(y_true, y_hat):\n# y_hat = np.round(y_hat)\n# y_hat[y_hat<0]=0\n# return np.sqrt(mean_squared_log_error(y_true, y_hat))\n\n# rmsle_score = make_scorer(rmsle, greater_is_better=False)",No,5,76.0 "# scores = cross_val_score(xgb_model, X, y, cv=5, scoring=rmsle_score)\n# print(""scores "", np.abs(scores))",No,5,76.0 "# d_importance = pd.DataFrame(columns=['features'], data=X.columns)\n# d_importance['importance'] = xgb_model.feature_importances_\n# d_importance.sort_values(by='importance',ascending=False).head(20)",No,5,76.0 "#grid_params = {'max_depth' : [12,14,16]}\n#grid_xgb = GridSearchCV(xgb_clf, grid_params, cv= 5)\n#grid_xgb.fit(x_train, y_train)\n#print(grid_xgb.best_score_)\n#grid_xgb.cv_results_\n#grid_xgb.score(x_test, y_test)",No,5,76.0 #!rm submission.csv,No,5,76.0 "# features_cyc = ['hour', 'weekday']\n# for feature in features_cyc:\n# train_data[feature+'_sin'] = np.sin((2*np.pi*train_data[feature])/max(train_data[feature]))\n# train_data[feature+'_cos'] = np.cos((2*np.pi*train_data[feature])/max(train_data[feature]))\n# test_data[feature+'_sin'] = np.sin((2*np.pi*test_data[feature])/max(test_data[feature]))\n# test_data[feature+'_cos'] = np.cos((2*np.pi*test_data[feature])/max(test_data[feature]))\n# train_data = train_data.drop(features_cyc, axis=1)\n# test_data = test_data.drop(features_cyc, axis=1)",No,5,76.0 "# TIP) .\n# time = train['datetime'].str.slice(11,13).astype(int)\n# time.head()\n",No,5,76.0 "# validate dataset 7:3\n#from sklearn.model_selection import train_test_split\n#train_x, validate_x, train_y, validate_y = train_test_split(train, y, test_size = 0.3,\n #random_state = 777)",No,5,76.0 "# . , .\n#from sklearn.ensemble import RandomForestRegressor\n# , . n_estimator .\n# cpu 1 . 
n_jobs=4 -1 CPU \n# random_state set.seed() \n#rf = RandomForestRegressor(n_estimators=100, n_jobs=-1,random_state=999)\n#rf.fit(train, y)",No,5,76.0 #result = rf.predict(test)\n ,No,5,76.0 "#from lightgbm import LGBMRegressor\n# boosting hyper parameter (, )\n#lgbm = LGBMRegressor()\n#lgbm.fit(train, y)\n",No,5,76.0 #preds = lgbm.predict(test),No,5,76.0 "# 0 .\n# , .\n# train.loc[train[""windspeed""] == 0, ""windspeed""] = train[""windspeed""].mean()\n# test.loc[train[""windspeed""] == 0, ""windspeed""] = train[""windspeed""].mean()",No,5,76.0 # realizando as tranformaes nos dados,No,5,76.0 "'''\nfrom sklearn.ensemble import RandomForestRegressor\nrf=RandomForestRegressor(n_estimators=100,random_state=0)\nrf.fit(X,Y)\nimp_list=rf.feature_importances_\nfeats = {} # a dict to hold feature_name: feature_importance\nfor feature, importance in zip(final_df.columns, rf.feature_importances_):\n feats[feature] = importance #add the name/value pair\n''' ",No,5,76.0 "#month => train count test count . , \n# . train \n# test . 20~31 . ",No,5,76.0 160/3251,No,3,76.0 "# scores.mean(), scores",No,5,76.0 "# from sklearn.cross_validation import cross_val_predict\n# y_pred = cross_val_predict(LogisticRegression(), X, Y, cv=10, n_jobs=-1, verbose=1)\n# log_loss(Y, y_pred)",No,5,76.0 "# from sklearn.model_selection import StratifiedKFold\n# kf = StratifiedKFold(n_splits=10, random_state=0)\n# pred = np.zeros((Y.shape[0], Y.nunique()))\n# for train_index, test_index in kf.split(X, Y):\n# X_train, X_test = X.iloc[train_index], X.iloc[test_index]\n# y_train, y_test = Y.iloc[train_index], Y.iloc[test_index]\n# lr = LogisticRegression(solver=\",No,5,76.0 "# from sklearn.ensemble import RandomForestClassifier\n# scores = cross_val_score(RandomForestClassifier(n_estimators=100), X, Y, scoring='neg_log_loss',cv=10, verbose=1)",No,5,76.0 "# scoresmean(), scores.",No,5,76.0 "#del train,valid,testdata\n#gc.collect()",No,5,76.0 "#clf = LogisticRegression(C=0.02)\n#clf.fit(Xtrain, y)\n#clf.predict_proba(Xtrain[70000:], y[70000:])\n#log_loss(yte, pred[itest, :])",No,5,76.0 "#pred = clf.predict_proba(Xtrain[70000:])\n#log_loss(y[70000:], pred)",No,5,76.0 "#pred = pd.DataFrame(clf.predict_proba(Xtest), index=ga_test.index, columns=target_encoder.classes_)\n#pred.head()\n#pred.to_csv('logreg_subm.csv',index=True)",No,5,76.0 ##train_users[train_users['id'] == 'bibf93h56j']\n##train_users['date_first_booking'].isnull(),No,5,76.0 ##train_users.head()\n##train_users[train_users['first_browser_grouped'] == 'Mobile']\n\n#### language doesn't appear that helpful.. anyway we can adjust it some?\n\n#train_users.head(),No,5,76.0 \n#test = [0]\n#train_users[\,No,5,76.0 "#fig, (axis1, axis2) = plt.subplots(2,1,figsize=(15,10))\n#sns.countplot(x=\",No,5,76.0 "\n\n#fig, (axis1) = plt.subplots(1,1,figsize=(15,5))\n#sns.countplot(x=\",No,5,76.0 ##### is it worthwhile to group up some of these X vars w/ a lot of subclasses? 
\n,No,5,76.0 "# from sklearn.model_selection import cross_val_score\n# from sklearn.ensemble import GradientBoostingClassifier\n\n# np.random.seed(42)\n# samples = np.random.choice(piv_train, 10000)\n# X_train = vals[samples]\n# y_train = le.fit_transform(labels)[samples]\n# model = GradientBoostingClassifier()\n# cross_val_score(model, X_train, y_train, cv=5, n_jobs=-1)",No,5,76.0 "# Too Much Unknown Data In Columns : Gender & First Browser , will need to fix that later\n# Now Let's Focus on the Dates Data",No,5,76.0 #Building the Classfication Model,No,5,76.0 # Using Submession System To Evaluate The Model,No,5,76.0 "# #trust your CV!\n# best_parameters,score = max(clf.scorer_, key=lambda x: x[1])\n# print(\",No,5,76.0 "# fig, ax = plt.subplots(5, 2, figsize=(10, 50))\n# for i in range(5):\n# ax[i, 0].imshow(X_train_zero[i])\n# ax[i, 1].imshow(Y_train_zero[i])\n# print(np.unique(Y_train_zero[i]))",No,5,76.0 "# fig, ax = plt.subplots(5, 2, figsize=(10, 50))\n# for i in range():\n# ax[i, 0].imshow(X_train_one[i], 'gray')\n# Y_train_one = np.array(Y_train_one, dtype='bool')\n# # Y_train_one[i][Y_train_one]=1\n# ax[i, 1].imshow(Y_train_one[i], 'gray')\n# print(np.unique(Y_train_one[i]))",No,5,76.0 "# for img, mask in zip(X_train_zero, Y_train_zero):\n# X_train.append(img)\n# Y_train.append(mask)",No,5,76.0 "# X_train_all = np.concatenate((X_train, X_train_zero[:1000]), axis=0)\n# Y_train_all = np.concatenate((Y_train, Y_train_zero[:1000]), axis=0)",No,5,76.0 "# model = Unet('densenet121',encorder_weights='imagenet',freeze_encorder=True)",No,5,76.0 "# N = 1\n\n# base_model = Unet(backbone_name='resnet34', encoder_weights='imagenet')\n\n# inp = Input(shape=(None, None, N))\n# l1 = Conv2D(3, (1, 1))(inp) # map N channels data to 3 channels\n# out = base_model(l1)\n\n# model = Model(inp, out, name=base_model.name)\n# model.compile(optimizer=Adam(lr = 1e-5), loss=dice_coef_loss, metrics=[dice_coef])",No,5,76.0 #embedding_dim = len(tokenizer.word_index)+1,No,5,76.0 "#my_submissionxgb = pd.DataFrame({'id': test.id, 'country':resultsxgb})\n#my_submissionxgb.to_csv('submissionxgb.csv', index=False)",No,5,76.0 "# vocab = vectorizer.get_feature_names()\n# dist = np.sum(train_data_features, axis=0)\n# for tag, count in zip(vocab, dist):\n# print (count, tag)",No,5,76.0 "# test = pd.read_csv(""../input/test.tsv"", sep=\",No,5,76.0 "#train_data = train.drop(['Phrase'], axis=1)\n#test_data = test.drop(['Phrase'], axis=1)",No,5,76.0 "#np.savetxt(""outpu.csv"", y1, delimiter="","")\n",No,5,76.0 "#train[""day of week""].value_counts()",No,5,76.0 "#grid_search.fit(X_train, y_train)",No,5,76.0 #!pip install xgboost,No,5,76.0 #from sklearn.model_selection import GridSearchCV\n#from sklearn.model_selection import ShuffleSplit,No,5,76.0 "'''\nxgb1 = xg_reg\nparameters = {'nthread':[3], #when use hyperthread, xgboost may become slower\n 'objective':['reg:linear'],\n 'learning_rate': [.03, 0.05, .07], #so called `eta` value\n 'max_depth': [5, 6, 7],\n 'min_child_weight': [4],\n 'silent': [1],\n 'subsample': [0.7],\n 'colsample_bytree': [0.7],\n 'n_estimators': [500]}\n\nxgb_grid = GridSearchCV(xgb1,\n parameters,\n cv = 2,\n n_jobs = 3,\n verbose=True)\n\nxgb_grid.fit(X_train,\n y_train)\n\nprint(xgb_grid.best_score_)\nprint(xgb_grid.best_params_)\n'''",No,5,76.0 "'''xg_reg = xgb.XGBRegressor(colsample_bytree= 0.7, learning_rate= 0.07, max_depth= 5, min_child_weight= 4, n_estimators= 300, nthread= 4, objective= 'reg:linear', silent= 1, subsample=0.7)'''",No,5,76.0 "'''xg_reg.fit(X_train,y_train)\n\nresult = 
xg_reg.predict(X_test)'''",No,5,76.0 "# sub_prev_year_median.to_csv('submission_prev_year.csv',index=False)\n# FileLink('submission_prev_year.csv')",No,5,76.0 "# sub_prev_year_median.to_csv('sub_median_60.csv',index=False)\n# FileLink('sub_median_60.csv')",No,5,76.0 "#world_population = pd.read_csv(""/kaggle/input/population-by-country-2020/population_by_country_2020.csv"")\n#display(world_population.head()) #for next round",No,5,76.0 "# ## Add 1-year `Weekly_Sales` lag ##\n\n# X_all = pd.concat([X_train, X_test])\n# X_all['Date2'] = pd.to_datetime(X_all['Date'], utc = True)\n# X_all['52_Week_Lag'] = X_all['Date2'] - np.timedelta64(52,'W')\n# X_all_temp = X_all[['Weekly_Sales', 'Date2', 'Store', 'Dept']]\n\n# X_all = X_all.merge(X_all_temp,\n# left_on=['Store', 'Dept', '52_Week_Lag'], \n# right_on=['Store', 'Dept', 'Date2'],\n# how='inner',\n# suffixes=('', '_y'))\n# X_all.rename(columns={'Weekly_Sales_y': 'Weekly_Sales_Lag_52_Weeks'}, inplace=True)\n# X_all = X_all[[col for col in X_all.columns if not col.endswith('_y')]]\n\n# drop_cols = ['Date2_y', '1_Year_Lag']\n# X_all.drop(['52_Week_Lag'], axis=1, inplace=True)\n\n# X_all.isna().sum()",No,5,76.0 "# X_train['Date2'] = pd.to_datetime(X_train['Date'], utc = True)\n# X_test['Date2'] = pd.to_datetime(X_test['Date'], utc = True)\n\n# X_train['Weekly_Sales_Lag_52_Weeks'] = X_train.merge(X_all, \n# left_on=['Store', 'Dept', 'Date2'], \n# right_on=['Store', 'Dept', 'Date2'],\n# how='inner')['Weekly_Sales_Lag_52_Weeks']\n# X_test['Weekly_Sales_Lag_52_Weeks'] = X_test.merge(X_all, \n# left_on=['Store', 'Dept', 'Date2'], \n# right_on=['Store', 'Dept', 'Date2'],\n# how='inner')['Weekly_Sales_Lag_52_Weeks']\n\n# X_test.head()",No,5,76.0 "# sns.set(style=""ticks"", color_codes=True)\n\n# for col in X_train.columns.drop(\",No,5,76.0 ## takes too long ##,No,5,76.0 # hist = pd.DataFrame(history.history)\n# hist.plot(),No,5,76.0 "## create 1-year lag value of store sales (can't use 1-week as test set doesn't have any `Weekly_Sales` ##\n## X_train ends at 2012-10-26 and X_test ends at 2013-07-26, so no NaN values for `Weekly_Sales_Lag` in X_test ##",No,5,76.0 "# X_all = pd.concat([X_train, X_test])\n# X_all.tail()",No,5,76.0 "### GridSearchCV test #X_test, _ = loadData(test, test = True) #prediction = grid_search.predict(X_test)",No,5,76.0 "### RF validation ''' X_train, X_test, y_train, y_test = train_test_split(X, new_y, test_size = 0.33, random_state = 42) rf = RandomForestRegressor() rf.fit(X_train, y_train) prediction = rf.predict(X_test) mean_squared_error(y_test, prediction) '''",No,5,76.0 "# standizer = StandardScaler() # data[np.array(data.columns[:])] = standizer.fit_transform(data[np.array(data.columns[:])]) # test[np.array(test.columns[:])] = standizer.transform(test[np.array(test.columns[:])]) ",No,5,76.0 "# rbs = RobustScaler() # data[np.array(data.columns[:])] = rbs.fit_transform(data[np.array(data.columns[:])]) # test[np.array(test.columns[:])] = rbs.fit_transform(test[np.array(test.columns[:])])",No,5,76.0 "# pca = PCA() # data[np.array(data.columns[:])] = pca.fit_transform(data[np.array(data.columns[:])]) # test[np.array(test.columns[:])] = pca.fit_transform(test[np.array(test.columns[:])])",No,5,76.0 "# krc = KernelCenterer() # data[np.array(data.columns[:])] = krc.fit_transform(data[np.array(data.columns[:])]) # test[np.array(test.columns[:])] = krc.transform(test[np.array(test.columns[:])])",No,5,76.0 "# kf = KFold(n_splits=5,shuffle=True) # random_forest_acc = 0 # adaboost_acc = 0 # extraRandom_acc = 0 # svm_acc = 0 # 
gradientBoosting_acc = 0 # for train_index, test_index in kf.split(data): # X_train = data.filter(items=train_index, axis=0) # X_test = data.filter(items=test_index, axis=0) # y_train = input_label[train_index] # y_test = input_label[test_index] # # for randomForest # random_forest_clf = RandomForestClassifier(n_estimators=50) # random_forest_clf.fit(X_train, y_train) # rand_given_labels = random_forest_clf.predict(X_test) # random_forest_acc += accuracy_score(y_test, rand_given_labels) # # for AdaBoost # adaboost_clf = AdaBoostClassifier(n_estimators = 100,learning_rate=0.5) # adaboost_clf.fit(X_train, y_train) # ada_given_labels = adaboost_clf.predict(X_test) # adaboost_acc += accuracy_score(y_test, ada_given_labels) # # for extra random forest # extraRandom= ExtraTreesClassifier(n_estimators=100, max_depth=None,min_samples_split=2) # extraRandom.fit(X_train, y_train) # xrand_given_labels = extraRandom.predict(X_test) # extraRandom_acc += accuracy_score(y_test, xrand_given_labels) # # for gradient boosting # gradientBoosting_clf = GradientBoostingClassifier(n_estimators=350, learning_rate=.1,max_depth=1) # gradientBoosting_clf.fit(X_train, y_train) # gradientBoosting_given_labels = gradientBoosting_clf.predict(X_test) # gradientBoosting_acc += accuracy_score(y_test, gradientBoosting_given_labels) # # for svm # svm_clf = svm.SVC(C= 0.1 , kernel='linear') # svm_clf.fit(X_train, y_train) # svm_given_labels = svm_clf.predict(X_test) # svm_acc += accuracy_score(y_test, svm_given_labels) ",No,5,76.0 "#feature selection # from sklearn.feature_selection import SelectKBest # from sklearn.feature_selection import chi2 # select = SelectKBest(chi2, k=6) # train = select.fit_transform(train, train_labels) # test = select.transform(test) # from sklearn.feature_selection import VarianceThreshold # sel = VarianceThreshold(threshold=0.1) # selFeature=sel.fit_transform(train)",No,5,76.0 "# #linearSVM (1) # linearSVM_clf = svm.SVC(kernel='linear', C=1).fit(train_normalized,train_labels) # #acc1=cross_val_score(clf, train_normalized, train_labels, cv=20, scoring='accuracy') # trainpred=linearSVM_clf.predict(train_normalized) # testpred=linearSVM_clf.predict(test_normalized) # print(metrics.accuracy_score(train_labels, trainpred)) # # print(acc1) # # print(np.mean(acc1)) # #results.append(clf.predict(test_normalized)) ",No,5,76.0 "# #rbfSVM (2) # rbfSVM_clf = svm.SVC(kernel='rbf', C=1).fit(train_normalized,train_labels) # #rbfSVM_acc=cross_val_score(rbfSVM_clf, train_normalized, train_labels, cv=20, scoring='accuracy') # trainpred=rbfSVM_clf.predict(train_normalized) # testpred=rbfSVM_clf.predict(test_normalized) # print(metrics.accuracy_score(train_labels, trainpred)) # # print(rbfSVM_acc) # # print(np.mean(rbfSVM_acc)) # #results.append(rbfSVM_clf.predict(test_normalized)) ",No,5,76.0 "# #logReg (5) # from sklearn.linear_model import LogisticRegression # logReg= LogisticRegression().fit(train_normalized,train_labels) # trainpred=logReg.predict(train_normalized) # testpred=logReg.predict(test_normalized) # print(metrics.accuracy_score(train_labels, trainpred)) # # logreg_acc=cross_val_score(logReg,train_normalized,train_labels,cv=10,scoring='accuracy') # # print(logreg_acc) # # print(np.mean(logreg_acc)) # #results.append(logReg.predict(test_normalized))",No,5,76.0 "# #NearestCentroid (6) # from sklearn.neighbors.nearest_centroid import NearestCentroid # NC_clf = NearestCentroid() # NC_clf.fit(train_normalized, train_labels) # trainpred=NC_clf.predict(train_normalized) # 
testpred=NC_clf.predict(test_normalized) # print(metrics.accuracy_score(train_labels, trainpred)) # # NC_acc=cross_val_score(NC_clf,train_normalized,train_labels,cv=10,scoring='accuracy') # # print(logreg_acc) # # print(np.mean(logreg_acc)) # #results.append(logReg.predict(test_normalized))",No,5,76.0 "# #SGD (11) # from sklearn.linear_model import SGDClassifier # sgd_clf = SGDClassifier(loss=""hinge"", penalty=""l2"").fit(train_normalized,train_labels) # trainpred=clf.predict(train_normalized) # print(metrics.accuracy_score(train_labels, trainpred)) # results.append(sgd_clf.predict(test_normalized)) ",No,5,76.0 "# #LDA (13) # from sklearn.discriminant_analysis import LinearDiscriminantAnalysis # LDA_clf = LinearDiscriminantAnalysis().fit(train_normalized,train_labels) # LDA_clf.fit(train_normalized,train_labels) # trainpred=LDA_clf.predict(train_normalized) # testpred=LDA_clf.predict(test_normalized) # print(metrics.accuracy_score(train_labels, trainpred)) # # #LDA_acc = cross_val_score(LDA_clf, train_normalized, train_labels, cv=10, scoring='accuracy') # # print(LDA_acc) # # print(np.mean(LDA_acc)) # # resultLDA = LDA_clf.predict(test_normalized)",No,5,76.0 "# #GaussianNB (14) # from sklearn.naive_bayes import GaussianNB # GNB_clf = GaussianNB() # GNB_clf.fit(train_normalized,train_labels) # trainpred=GNB_clf.predict(train_normalized) # testpred=GNB_clf.predict(test_normalized) # print(metrics.accuracy_score(train_labels, trainpred)) # # #GNB_acc = cross_val_score(GNB_clf, train_normalized, train_labels, cv=10, scoring='accuracy') # # print(GNB_acc) # # print(np.mean(GNB_acc))",No,5,76.0 "# svmodel = svm.SVC(C=1, kernel=""poly"") # svmodel.fit(main, labels)",No,5,76.0 "# sklearn.metrics.accuracy_score(labels, svmodel.predict(features))",No,5,76.0 "# sklearn.metrics.accuracy_score(tlabels, svmodel.predict(sfeatures))",No,5,76.0 "# cols = { \'PlayerID\': [i+901 for i in range(440)] , \'TARGET_5Yrs\': svmodel.predict(test_principalComponenta) } # submission = pd.DataFrame(cols) # submission.to_csv(""submission.csv"", index=False) # submission",No,5,76.0 "inputf = features # frst1 = IsolationForest(n_estimators=5) # frst1.fit(inputf, labels) # frst2 = IsolationForest(n_estimators=5) # frst2.fit(inputf, labels) # frst3 = IsolationForest(n_estimators=5) # frst3.fit(inputf, labels) # frst4 = IsolationForest(n_estimators=5) # frst4.fit(inputf, labels) # frst5 = IsolationForest(n_estimators=5) # frst5.fit(inputf, labels) # frst6 = IsolationForest(n_estimators=5) # frst6.fit(inputf, labels) # frst7 = IsolationForest(n_estimators=5) # frst7.fit(inputf, labels) # frst8 = IsolationForest(n_estimators=5) # frst8.fit(inputf, labels) # frst9 = IsolationForest(n_estimators=5) # frst9.fit(inputf, labels) # frst10 = IsolationForest(n_estimators=5) # frst10.fit(inputf, labels)",Yes,4,76.0 "testf = sfeatures # pred1 = frst1.predict(testf) # pred2 = frst2.predict(testf) # pred3 = frst3.predict(testf) # pred4 = frst4.predict(testf) # pred5 = frst5.predict(testf) # pred6 = frst6.predict(testf) # pred7 = frst7.predict(testf) # pred8 = frst8.predict(testf) # pred9 = frst9.predict(testf) # pred10 = frst10.predict(st_sfeatures)",Yes,4,76.0 "# res1 = [] # for i1,i2,i3,i4,i5,i6,i7,i8,i9,i10 in zip(pred1, pred2, pred3, pred4, pred5, pred6, pred7, pred8, pred9, pred10): # j = np.sum([i1,i2,i3,i4,i5,i6,i7,i8,i9,i10]) # if j >= 5: # res1.append(1) # else: # res1.append(0)",No,5,76.0 "# from keras.preprocessing import image # from os import walk # data=[] # input_file_names=[] # #####get the file names of the images to 
read them one by one # for (dirpath, dirnames, filenames) in walk(""../input/dogs-vs-cats-redux-kernels-edition/train""): # input_file_names=filenames # for x in input_file_names: # img_file_name=x##getting name of the image file # path=str(""../input/train/""+img_file_name)####making proper path of the image file # i=image.load_img(path)####reading the image from the path # i=i.resize((64,64))#####resizing the image # iarray=image.img_to_array(i)####converting it to arrau # data.append(iarray)#####appending the image to the list ",No,5,76.0 # plt.imshow(data[5]),No,5,76.0 "# data=np.array(data) # ####generating labels for the data # labels=[] # for x in input_file_names: # if x.find(""cat"")>=0: # labels.append(0) # else: # labels.append(1) # ###checking if the labels are properly tagged or not,both the classes have equal images 12500 each # a=np.array(labels) # np.unique(a,return_counts=True) # ###reshaping the labels # labels=a.reshape(25000,1)",No,5,76.0 "# #####rescaling the data # data=data/255. ",No,5,76.0 "# model=Sequential() # model.add(Conv2D(64,kernel_size=(2,2),strides=(1,1),kernel_initializer=glorot_normal(seed=100),input_shape=(64,64,3) )) # model.add(Conv2D(64,kernel_size=(2,2),strides=(1,1),kernel_initializer=glorot_normal(seed=100))) # model.add(BatchNormalization()) # model.add(Activation(""relu"")) # model.add(MaxPooling2D((2,2))) # model.add(Conv2D(128,kernel_size=(2,2),strides=(1,1),kernel_initializer=glorot_normal(seed=100))) # model.add(Conv2D(128,kernel_size=(2,2),strides=(1,1),kernel_initializer=glorot_normal(seed=100))) # model.add(BatchNormalization()) # model.add(Activation(""relu"")) # model.add(MaxPooling2D((2,2))) # model.add(Conv2D(256,kernel_size=(2,2),strides=(1,1),kernel_initializer=glorot_normal(seed=100))) # model.add(Conv2D(256,kernel_size=(2,2),strides=(1,1),kernel_initializer=glorot_normal(seed=100))) # model.add(Conv2D(256,kernel_size=(2,2),strides=(1,1),kernel_initializer=glorot_normal(seed=100))) # model.add(BatchNormalization()) # model.add(Activation(""relu"")) # model.add(MaxPooling2D((2,2))) # model.add(Conv2D(512,kernel_size=(2,2),strides=(1,1),kernel_initializer=glorot_normal(seed=100))) # model.add(Conv2D(512,kernel_size=(2,2),strides=(1,1),kernel_initializer=glorot_normal(seed=100))) # model.add(Conv2D(512,kernel_size=(2,2),strides=(1,1),kernel_initializer=glorot_normal(seed=100))) # model.add(BatchNormalization()) # model.add(Activation(""relu"")) # model.add(MaxPooling2D((2,2))) # model.add(Flatten()) # model.add(Dense(10,activation=""relu"",kernel_initializer=glorot_normal(seed=100))) # model.add(Dense(1,activation=""sigmoid"",kernel_initializer=glorot_normal(seed=100))) ",No,5,76.0 "###train test split # from sklearn.model_selection import train_test_split # train_x,test_x,train_y,test_y=train_test_split(data,labels,test_size=0.2,random_state=100) ",No,5,76.0 # train_y.shape,No,5,76.0 "####compiling the model # o=optimizers.adam() # model.compile(loss=""binary_crossentropy"",metrics=[""accuracy""],optimizer=o) ",No,5,76.0 "####fitting the model # H=model.fit(train_x,train_y,epochs=16,validation_split=0.2) ",No,5,76.0 "# plt.plot(range(1,17),H.history[""acc""]) # plt.plot(range(1,17),H.history[""val_acc""]) ",No,5,76.0 "#####making predictions on the test data # preds=model.predict_classes(test_x)",No,5,76.0 # sum(preds==test_y)/len(test_y),No,5,76.0 "# test_data=[] # input_test_file_names=[] # #####get the file names of the images to read them one by one # for (dirpath, dirnames, filenames) in walk(""../input/test""): # 
input_test_file_names=filenames # for x in input_test_file_names: # img_file_name=x##getting name of the image file # path=str(""../input/test/""+img_file_name)####making proper path of the image file # i=image.load_img(path)####reading the image from the path # i=i.resize((64,64))#####resizing the image # iarray=image.img_to_array(i)####converting it to arrau # test_data.append(iarray)#####appending the image to the list",No,5,76.0 "# test_data=np.array(test_data) # test_data=test_data/255.",No,5,76.0 "# test_preds=model.predict(test_data) # test_preds=test_preds.reshape(len(test_preds))",No,5,76.0 "####as per the submission file rule only numerical part from the file wwas needed ####like 3090.jpg should be saved in as 3090 # new_input_test_file_names=[] # for x in input_test_file_names: # k=int(x[0:x.find("".jpg"")]) # new_input_test_file_names.append(k)",No,5,76.0 "# df=pd.DataFrame({'id':new_input_test_file_names, # 'label':test_preds})",No,5,76.0 "# df.to_csv(""submission.csv"",index=False)",No,5,76.0 "# from keras.applications import resnet50 # from keras.preprocessing.image import ImageDataGenerator # r=resnet50.ResNet50(weights='imagenet',include_top=False,input_shape=(197,197,3))",No,5,76.0 "#########33getting data in in sahpe of (197,197,3) as min reqrmnt of resnet 50 # from keras.preprocessing import image # from os import walk # data=[] # input_file_names=[] # #####get the file names of the images to read them one by one # for (dirpath, dirnames, filenames) in walk(""../input/dogs-vs-cats-redux-kernels-edition/train/""): # input_file_names=filenames # rand_imgs_indexes=random.sample(range(0, 24999), 14000) # new_input_file_names=[] # ######taking only 20000 random images # for k in rand_imgs_indexes: # new_input_file_names.append(input_file_names[k]) # for x in new_input_file_names: # img_file_name=x##getting name of the image file # path=str(""../input/dogs-vs-cats-redux-kernels-edition/train/""+img_file_name)####making proper path of the image file # i=image.load_img(path)####reading the image from the path # i=i.resize((197,197))#####resizing the image # iarray=image.img_to_array(i)####converting it to arrau # iarray=iarray/255. 
# data.append(iarray)#####appending the image to the list",No,5,76.0 "# data=np.array(data) # ####generating labels for the data # labels=[] # for x in new_input_file_names: # if x.find(""cat"")>=0: # labels.append(0) # else: # labels.append(1) # ###reshaping the labels # a=np.array(labels) # labels=a.reshape(14000,1)",No,5,76.0 "# #########defining the new model by defining my own last layer # new_model=r.output # new_model=Flatten()(new_model) # new_model=Dense(10)(new_model) # new_model=Activation(""relu"")(new_model) # new_model=Dense(1,activation=""sigmoid"")(new_model) # final_model=Model(input=r.input,output=new_model) ",No,5,76.0 "###freezin all layers except from last 3 layers # total_layers=len(final_model.layers) # print(total_layers) # for x in range(0,total_layers-4): # final_model.layers[x].trainable=False # final_model.layers",No,5,76.0 "##checking if the layers have been frozen or not # for x in range(0,total_layers): # print(final_model.layers[x]) # print(final_model.layers[x].trainable)",No,5,76.0 "###train test split # from sklearn.model_selection import train_test_split # train_x,test_x,train_y,test_y=train_test_split(data,labels,test_size=0.2,random_state=100) ",No,5,76.0 "####compiling the model # o=optimizers.adam() # final_model.compile(loss=""binary_crossentropy"",metrics=[""accuracy""],optimizer=o)",No,5,76.0 "# final_model.fit(train_x,train_y,epochs=2,validation_split=0.2)",No,5,76.0 "# predicted_test=final_model.predict(train_x) ",No,5,76.0 ##########################trying vgg19 model###################################,No,5,76.0 "# final_model.save_weights(""vgg_19.h5"")",No,5,76.0 #lrf= learn.lr_find(),No,5,76.0 #learn.sched.plot_lr(),No,5,76.0 #learn.sched.plot(),No,5,76.0 #learn.save('model1'),No,5,76.0 #learn.save('model2'),No,5,76.0 #learn.load('model2'),No,5,76.0 #learn.save('model3'),No,5,76.0 "#tmpk= log_preds #tmpk= log_preds[:,:,0] #tmpk=tmpk.reshape(tmpk.shape[1],tmpk.shape[0])",No,5,76.0 #tmpk= [np.mean(i) for i in tmpk],No,5,76.0 #tmpk= [ np.exp(i) for i in tmpk],No,5,76.0 "# model 2 mean 5.006981e-01 std 4.964828e-01 min 1.536870e-09 [0.025943222, 0.9912974683544303]",No,5,76.0 "#model 1 mean 5.014179e-01 std 4.955358e-01 min 2.701442e-08 [0.026336912, 0.991495253164557]",No,5,76.0 "# from sklearn.ensemble import RandomForestRegressor # rfr = RandomForestRegressor(n_estimators=300, criterion='mae', max_depth=12, n_jobs=-1, verbose=True) # rfr.fit(X_train.values, np.log(y_train.values) + 1) # y_hat = rfr.predict(X_test.values) # y_hat = np.exp(y_hat) - 1 # print(f'MAE: {mae(y_test, y_hat)}') # print(f'RMSPE: {rmspe(y_hat, y_test)}')",No,5,76.0 "# params = {'colsample_bytree': 0.7000000000000001, # 'eta': 0.625, # 'gamma': 0.8, # 'max_depth': 6, # 'eval_metric': 'rmse', # 'min_child_weight': 6.0, # 'n_estimators': 8.0, # 585 # 'silent': 1, # 'subsample': 0.9500000000000001} # watchlist = [(xtrain, 'train'), (xtest, 'eval')] # num_round = 10000 # xgb_regressor = xgb.train(params, xtrain, num_round, watchlist, feval=rmspe_xg, # verbose_eval=10, early_stopping_rounds=50)",No,5,76.0 "# fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(20, 20)); # xgb.plot_importance(xgb_regressor, axes)",No,5,76.0 "# print(""Validating"") # train_probs = xgb_regressor.predict(xtest) # indices = train_probs < 0 # train_probs[indices] = 0 # error = rmspe(np.exp(train_probs) - 1, y_test.values) # print(\'error\', error) # xgb_regressor = xgb.train(params, xtest, 1000, feval=rmspe_xg, xgb_model=xgb_regressor)",No,5,76.0 # print(best_opts),No,5,76.0 "#PMR3508 - Tarefa1 
Adult #hash: PMR3508-2018-d59e43f3c1",No,5,76.0 # This is based on Fujisan's kernel on Invasive Species Monitoring,No,5,76.0 "# Grid Search is an ideal candidate for distributed machine learning # Pseudo code for hyperparameters Grid Search ''' from sklearn.grid_search import ParameterGrid param_grid = {'epochs': [5, 10, 15], 'steps_per_epoch' : [10, 20, 50]} grid = ParameterGrid(param_grid) # Accumulate history of all permutations (may be for viewing trend) and keep watching for lowest val_loss as final model for params in grid: print(params) '''",No,5,76.0 "# Normalizar las imgenes (1pt) #x_train = x_train.reshape([1712, 96*96])/255 #x_val = x_val.reshape([428, 96*96])/255 #x_train[0] #valores entre 0 y 1, usara una capa de batchnormalization en la red #Se realiz esto en iteraciones previas, el resultado fue peor, se decide no scalar a [0,1] ni utilizar batch normalization",No,5,76.0 "#labels_axis =np.array([['_x','_y']]) #labels_axis = np.repeat(labels_axis,26745,axis=0).flatten() #labels_axis.shape",No,5,76.0 "#labels= np.core.defchararray.add(labels_area, labels_axis) #labels.shape",No,5,76.0 "#ImageId = np.arange(1,1784) #ImageId =np.repeat(ImageId, 30) #ImageId.shape",No,5,76.0 "#RowId=np.int32(np.arange(1,53491)) #RowId.shape",No,5,76.0 "#sub = np.array([RowId,ImageId,labels,results]) #sub = np.swapaxes(sub,0,1) #sub.shape",No,5,76.0 "#sub_df = pd.DataFrame(data=sub,columns=['RowId','ImageId','FeatureName','Location']) #sub_df.ImageId = pd.to_numeric(sub_df.ImageId) ",No,5,76.0 #sub_df[(sub_df['FeatureName'] == 'left_eye_center_x') & (sub_df['ImageId'] == 1)],No,5,76.0 "# model = Sequential() # model.add(Conv2D(filters = 32, kernel_size = (5,5), padding = 'Same', activation = 'linear', input_shape = (96, 96, 1))) # model.add(LeakyReLU(alpha=.001)) # model.add(MaxPool2D(pool_size=(2,2), strides=(2,2))) # model.add(Dropout(0.7)) # model.add(Flatten()) # model.add(Dense(256, activation = 'linear')) # model.add(LeakyReLU(alpha=.001)) # model.add(Dropout(0.7)) # model.add(Dense(128, activation = 'linear')) # model.add(LeakyReLU(alpha=.001)) # model.add(Dropout(0.7)) # model.add(Dense(30))",No,5,76.0 "# model2 = Sequential() # model2.add(Conv2D(filters = 64, kernel_size = (4,4), padding = 'Same', activation = 'linear', input_shape = (96, 96, 1))) # model2.add(LeakyReLU(alpha=.001)) # model2.add(MaxPool2D(pool_size=(2,2), strides=(2,2))) # model2.add(Dropout(0.5)) # model2.add(Flatten()) # model2.add(Dense(256, activation = 'linear')) # model2.add(LeakyReLU(alpha=.001)) # model2.add(Dropout(0.7)) # model2.add(Dense(30))",No,5,76.0 "# model3 = Sequential() # model3.add(Conv2D(filters = 128, kernel_size = (5,5), padding = 'Same', activation = 'linear', input_shape = (96, 96, 1))) # model3.add(LeakyReLU(alpha=.001)) # model3.add(MaxPool2D(pool_size=(2,2), strides=(2,2))) # model3.add(Dropout(0.5)) # model3.add(Flatten()) # model3.add(Dense(256, activation = 'linear')) # model3.add(LeakyReLU(alpha=.001)) # model3.add(Dropout(0.5)) # model3.add(Dense(128, activation = 'linear')) # model3.add(LeakyReLU(alpha=.001)) # model3.add(Dropout(0.7)) # model3.add(Dense(30))",No,5,76.0 "# np.random.seed(777) # model4 = Sequential() # model4.add(Conv2D(filters = 64, kernel_size = (5,5), padding = 'Same', activation = 'elu', input_shape = (96, 96, 1))) # model4.add(MaxPool2D(pool_size=(2,2), strides=(2,2))) # model4.add(Dropout(0.3)) # model4.add(Conv2D(filters = 64, kernel_size = (5,5), padding = 'Same', activation = 'elu')) # model4.add(MaxPool2D(pool_size=(2,2), strides=(2,2))) # 
model4.add(Dropout(0.5)) # model4.add(Flatten()) # model4.add(Dense(128, activation = 'relu')) # model4.add(Dropout(0.5)) # model4.add(Dense(30, activation = 'linear'))",No,5,76.0 "# np.random.seed(777) # model5 = Sequential() # model5.add(Conv2D(filters = 32, kernel_size = (4,4), padding = 'Same', activation = 'relu', input_shape = (96, 96, 1))) # model5.add(Conv2D(filters = 64, kernel_size = (4,4), padding = 'Same', activation = 'relu')) # model5.add(MaxPool2D(pool_size=(2,2), strides=(2,2))) # model5.add(Dropout(0.25)) # model5.add(Flatten()) # model5.add(Dense(256, activation = 'relu')) # model5.add(Dropout(0.5)) # model5.add(Dense(128, activation = 'relu')) # model5.add(Dropout(0.7)) # model5.add(Dense(30))",No,5,76.0 "# np.random.seed(777) # model6 = Sequential() # model6.add(Conv2D(filters = 64, kernel_size = (5,5), padding = 'Same', activation = 'elu', input_shape = (96, 96, 1))) # model6.add(MaxPool2D(pool_size=(2,2), strides=(2,2))) # model6.add(Dropout(0.3)) # model6.add(Conv2D(filters = 32, kernel_size = (4,4), padding = 'Same', activation = 'elu')) # model6.add(MaxPool2D(pool_size=(2,2), strides=(2,2))) # model6.add(Dropout(0.5)) # model6.add(Flatten()) # model6.add(Dense(256, activation = 'elu')) # model6.add(Dropout(0.5)) # model6.add(Dense(128, activation = 'relu')) # model6.add(Dropout(0.7)) # model6.add(Dense(30))",No,5,76.0 "# np.random.seed(777) # model7 = Sequential() # model7.add(Conv2D(filters = 64, kernel_size = (5,5), padding = 'Same', activation = 'relu', input_shape = (96, 96, 1))) # model7.add(MaxPool2D(pool_size=(2,2), strides=(2,2))) # model7.add(Dropout(0.5)) # model7.add(Conv2D(filters = 32, kernel_size = (4,4), padding = 'Same', activation = 'relu')) # model7.add(MaxPool2D(pool_size=(2,2), strides=(2,2))) # model7.add(Dropout(0.5)) # model7.add(Flatten()) # model7.add(Dense(128, activation = 'relu')) # model7.add(Dropout(0.7)) # model7.add(Dense(30))",No,5,76.0 "# np.random.seed(777) # model8 = Sequential() # model8.add(Conv2D(filters = 64, kernel_size = (6,6), padding = 'Same', activation = 'relu', input_shape = (96, 96, 1))) # model8.add(MaxPool2D(pool_size=(2,2), strides=(2,2))) # model8.add(Dropout(0.3)) # model8.add(Conv2D(filters = 32, kernel_size = (4,4), padding = 'Same', activation = 'relu')) # model8.add(MaxPool2D(pool_size=(2,2), strides=(2,2))) # model8.add(Dropout(0.5)) # model8.add(Flatten()) # model8.add(Dense(256, activation = 'relu')) # model8.add(Dropout(0.5)) # model8.add(Dense(128, activation = 'relu')) # model8.add(Dense(30))",No,5,76.0 "# np.random.seed(777) # model9 = Sequential() # model9.add(Conv2D(filters = 128, kernel_size = (5,5), padding = 'Same', activation = 'linear', input_shape = (96, 96, 1))) # model9.add(LeakyReLU(alpha=.001)) # model9.add(MaxPool2D(pool_size=(2,2), strides=(2,2))) # model9.add(Dropout(0.5)) # model9.add(Flatten()) # model9.add(Dense(256, activation = 'linear')) # model9.add(LeakyReLU(alpha=.001)) # model9.add(Dropout(0.5)) # model9.add(Dense(128, activation = 'linear')) # model9.add(LeakyReLU(alpha=.001)) # model9.add(Dropout(0.7)) # model9.add(Dense(30))",No,5,76.0 "# model.compile(optimizer = 'adam', loss = 'mse', metrics = ['mae', 'accuracy'])",No,5,76.0 "# model2.compile(optimizer = 'adam', loss = 'mse', metrics = ['mae', 'accuracy'])",No,5,76.0 "# model3.compile(optimizer = 'adam', loss = 'mse', metrics = ['mae', 'accuracy'])",No,5,76.0 "# model4.compile(optimizer = 'adam', loss = 'mse', metrics = ['mae', 'accuracy'])",No,5,76.0 "# model5.compile(loss='mse', optimizer='adam', metrics = 
['accuracy'])",No,5,76.0 "# model6.compile(loss='mse', optimizer='adam', metrics = ['accuracy'])",No,5,76.0 "# model7.compile(loss='mse', optimizer='adam', metrics = ['accuracy'])",No,5,76.0 "# model8.compile(loss='mse', optimizer='adam', metrics = ['accuracy'])",No,5,76.0 "# model9.compile(loss='mse', optimizer='adam', metrics = ['accuracy'])",No,5,76.0 "# time1 = time.time() # model.fit(x, y, epochs=100, batch_size=128, validation_split=0.2) # time2 = time.time() # print('Learning Finished!') # chk_processting_time(time1, time2)",No,5,76.0 "# time1 = time.time() # model2.fit(x, y, epochs=100, batch_size=100, validation_split=0.3) # time2 = time.time() # print('Learning Finished!') # chk_processting_time(time1, time2)",No,5,76.0 "# time1 = time.time() # model3.fit(x, y, epochs=100, batch_size=100, validation_split=0.2) # time2 = time.time() # print('Learning Finished!') # chk_processting_time(time1, time2)",No,5,76.0 "# time1 = time.time() # model4.fit(x, y, epochs=100, batch_size=100, validation_split=0.25) # time2 = time.time() # print('Learning Finished!') # chk_processting_time(time1, time2)",No,5,76.0 "# time1 = time.time() # history = model5.fit(x, y, validation_split=0.3, epochs=100, batch_size=100, verbose=1, callbacks=[early_stopping_callback, checkpointer]) # time2 = time.time() # print('Learning Finished!') # chk_processting_time(time1, time2)",No,5,76.0 "# time1 = time.time() # history = model6.fit(x, y, validation_split=0.3, epochs=100, batch_size=100, verbose=1, callbacks=[early_stopping_callback, checkpointer]) # time2 = time.time() # print('Learning Finished!') # chk_processting_time(time1, time2)",No,5,76.0 "# time1 = time.time() # history = model7.fit(x, y, validation_split=0.3, epochs=100, batch_size=100, verbose=1, callbacks=[early_stopping_callback, checkpointer]) # time2 = time.time() # print('Learning Finished!') # chk_processting_time(time1, time2)",No,5,76.0 "# time1 = time.time() # history = model8.fit(x, y, validation_split=0.3, epochs=100, batch_size=100, verbose=1, callbacks=[early_stopping_callback, checkpointer]) # time2 = time.time() # print('Learning Finished!') # chk_processting_time(time1, time2)",No,5,76.0 "# time1 = time.time() # history = model9.fit(x, y, validation_split=0.2, epochs=100, batch_size=100, verbose=1, callbacks=[early_stopping_callback, checkpointer]) # time2 = time.time() # print('Learning Finished!') # chk_processting_time(time1, time2)",No,5,76.0 # !pip install kaggle,No,5,76.0 # !pip show kaggle,No,5,76.0 # !kaggle config path,No,5,76.0 # ! 
kaggle competitions submit -c facial-keypoints-detection -f predict.csv -m'submission,No,5,76.0 # !kaggle competitions submissions -c facial-keypoints-detection,No,5,76.0 "'''df=pd.get_dummies(df,columns=[ 'Month', 'District'],drop_first=True) df=pd.get_dummies(df,columns=[ 'Year'],drop_first=True) #df_test=pd.get_dummies(df_test,columns=['DayOfWeek','PdDistrict','Year','Month','Day','Hour','Minute'],drop_first=True) ''' ",No,5,76.0 """""""df_test=pd.get_dummies(df_test,columns=[ \'Month\', \'District\'],drop_first=True) df_test=pd.get_dummies(df_test,columns=[ \'Year\'],drop_first=True)""""""",No,5,76.0 "#df=df[['Hour', 'Day', 'Month', 'Year', 'Address', # 'District','X','radial60','Intersection']]",No,5,76.0 "#df=pd.get_dummies(df,columns=[ 'Hour'],drop_first=True) ",No,5,76.0 "#df_test=pd.get_dummies(df_test,columns=[ 'Hour'],drop_first=True)",No,5,76.0 #df.columns.nunique(),No,5,76.0 #df_test.columns.nunique(),No,5,76.0 "'''import lightgbm as lgb model5= lgb.LGBMClassifier(objective='multiclass') model5.fit(X_train,y_train) y_final=model5.predict_proba(X_test) print (log_loss(y_test,y_final));'''",No,5,76.0 "#print (log_loss(y_test,y_pred));",No,5,76.0 """""""temp = data[\'Category\'] le.fit_transform(temp) le.classes_ """"""",No,5,76.0 "# y_pred= pd.DataFrame(y_pred, index=Id_test,columns = le.classes_) ",No,5,76.0 "#from sklearn.linear_model import LogisticRegression #weight={Address:3,District:3,X:1,Day:2} #weight={LARCENY/THEFT:35} #classifier = LogisticRegression(penalty='l1',random_state = 0,class_weight='balanced',multi_class='multinomial', solver='saga',n_jobs=-1) #classifier = LogisticRegression(random_state=0, penalty='l1',multi_class='multinomial', solver='saga' ) #classifier.fit(X_train[0:50000],y_train[0:50000])",No,5,76.0 #y_pred=model.predict_proba(X_test),No,5,76.0 """""""from sklearn.neighbors import KNeighborsClassifier knn = KNeighborsClassifier(algorithm=\'auto\', leaf_size=60, metric=\'minkowski\', metric_params=None, n_jobs=1, n_neighbors=5, p=2, weights=\'uniform\') knn.fit(X_train[0:100000], y_train[0:100000]) """""" ",No,5,76.0 #y_pred=knn.predict_proba(X_test),No,5,76.0