Best Python code snippet using molecule_python
ml.py
Source:ml.py
import hashlib

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import *
from sklearn.metrics import roc_curve, auc, confusion_matrix
from sklearn.model_selection import ParameterGrid
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

import categorical_encoder as cat_encode

MODELS_TO_RUN = ['RF', 'DT', 'LR', 'SVM', 'KNN']
FEATURES = ['platform', 'visitorType', 'CategoryID']
PATH_RESULTS = "./results/"
PATH_IMAGES = "./images/"


# Simple undersampling of the majority class to ensure a balanced data set is used for training/validation
def under_sampling(df: pd.DataFrame, response_col: str):
    df_one = df.loc[df[response_col] == 1]
    df_zero = df.loc[df[response_col] == 0]
    if len(df_one) < len(df_zero):
        df_zero = df_zero.sample(n=len(df_one))
    else:
        df_one = df_one.sample(n=len(df_zero))
    return pd.concat([df_zero, df_one])


def prepare_input(df: pd.DataFrame):
    """
    Prepare key inputs for the later steps of the pipeline
    Input:
    - df: pandas dataframe
    Output:
    - train data transformed, test data transformed, original train data, original test data,
      categorical pipeline
    """
    df['NewID'] = df.index
    train_set, test_set = split_train_test_by_id(df, 0.3, 'NewID')
    cat_attribs = [FEATURES[0], FEATURES[1], FEATURES[2]]
    cat_pipeline = Pipeline([
        ('selector', cat_encode.DataFrameSelector(cat_attribs)),
        ('cat_encoder', cat_encode.CategoricalEncoder(encoding="onehot-dense")),
    ])
    train_set_num = train_set[FEATURES]
    train_prepared = cat_pipeline.fit_transform(train_set_num)
    test_set_num = test_set[FEATURES]
    test_prepared = cat_pipeline.transform(test_set_num)
    return train_prepared, test_prepared, train_set, test_set, cat_pipeline


def find_best_model(df: pd.DataFrame, grid_size: str, outcome_var: str, file_name=None):
    """
    Use grid search to find the best model
    Input:
    - df: pandas dataframe
    - grid_size: one of 3 possible values: 'test', 'small', 'large'
    - outcome_var: the outcome variable
    - file_name: file name of the csv file containing the results
    Output:
    - either return a dataframe or save the results as a csv file
    """
    clfs, grid = define_clfs_params(grid_size)
    # define models to run
    models_to_run = MODELS_TO_RUN
    # call clf_loop and store results in results_df
    train_prepared, test_prepared, train_set, test_set, cat_pipeline = prepare_input(df)
    results_df = clf_loop(models_to_run, clfs, grid, train_prepared, test_prepared,
                          train_set[outcome_var], test_set[outcome_var])
    # save to csv
    if file_name:
        file_name = PATH_RESULTS + file_name
        results_df.to_csv(file_name, index=False)
    else:
        return results_df


# Calculate the AUC score for a 1-feature decision tree as the baseline result
def baseline_model(df: pd.DataFrame, outcome_var: str):
    train_prepared, test_prepared, train_set, test_set, cat_pipeline = prepare_input(df)
    dec_tree = DecisionTreeClassifier(max_depth=1, min_samples_split=10)
    y_pred_probs = dec_tree.fit(train_prepared, train_set[outcome_var]).predict_proba(test_prepared)[:, 1]
    print("AUC score of 1-feature decision tree: " + str(roc_auc_score(test_set[outcome_var], y_pred_probs)))


# Fit the best model and plot the ROC graph
def fit_random_forest(df: pd.DataFrame, outcome_var: str):
    train_prepared, test_prepared, train_set, test_set, cat_pipeline = prepare_input(df)
    model = RandomForestClassifier(max_depth=5, max_features='sqrt', min_samples_split=2,
                                   n_estimators=10, n_jobs=-1)
    model.fit(train_prepared, train_set[outcome_var])
    model_preds = model.predict_proba(test_prepared)
    prob_true = model_preds[:, 1]
    plot_roc("RandomForest", prob_true, test_set[outcome_var], "save")


# Code from line 109 to line 321 is adapted from Rayid Ghani's github: https://github.com/rayidghani/magicloops
# Plot the ROC curve
def plot_roc(name, probs, true, output_type):
    fpr, tpr, thresholds = roc_curve(true, probs)
    roc_auc = auc(fpr, tpr)
    plt.clf()
    plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.05])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(name)
    plt.legend(loc="lower right")
    if output_type == 'save':
        plt.savefig(PATH_IMAGES + name + '_roc.png')
    else:
        plt.show()


# Generate binary predictions at a specified cutoff point defined as k percent of the sample
# Only apply to y sorted in descending order of score
def generate_binary_at_k(y_scores, k):
    cutoff_index = int(len(y_scores) * (k / 100.0))
    predictions_binary = [1 if x < cutoff_index else 0 for x in range(len(y_scores))]
    return predictions_binary


def joint_sort_descending(l1, l2):
    # l1 and l2 have to be numpy arrays
    idx = np.argsort(l1)[::-1]
    return l1[idx], l2[idx]


# Calculate precision at k
def precision_at_k(y_true, y_scores, k):
    y_scores_sorted, y_true_sorted = joint_sort_descending(np.array(y_scores), np.array(y_true))
    preds_at_k = generate_binary_at_k(y_scores_sorted, k)
    precision = precision_score(y_true_sorted, preds_at_k)
    return precision


# Calculate recall at k
def recall_at_k(y_true, y_scores, k):
    y_scores_sorted, y_true_sorted = joint_sort_descending(np.array(y_scores), np.array(y_true))
    preds_at_k = generate_binary_at_k(y_scores_sorted, k)
    recall = recall_score(y_true_sorted, preds_at_k)
    return recall


# Create a confusion matrix
def create_confusion_matrix(y_true, y_scores, k):
    y_scores_sorted, y_true_sorted = joint_sort_descending(np.array(y_scores), np.array(y_true))
    preds_at_k = generate_binary_at_k(y_scores_sorted, k)
    table = confusion_matrix(y_true_sorted, preds_at_k)
    return table


# Plot the precision-recall curve
def plot_precision_recall_n(y_true, y_prob, model_name, output_type):
    from sklearn.metrics import precision_recall_curve
    y_score = y_prob
    precision_curve, recall_curve, pr_thresholds = precision_recall_curve(y_true, y_score)
    precision_curve = precision_curve[:-1]
    recall_curve = recall_curve[:-1]
    pct_above_per_thresh = []
    number_scored = len(y_score)
    for value in pr_thresholds:
        num_above_thresh = len(y_score[y_score >= value])
        pct_above_thresh = num_above_thresh / float(number_scored)
        pct_above_per_thresh.append(pct_above_thresh)
    pct_above_per_thresh = np.array(pct_above_per_thresh)
    plt.clf()
    fig, ax1 = plt.subplots()
    ax1.plot(pct_above_per_thresh, precision_curve, 'b')
    ax1.set_xlabel('percent of population')
    ax1.set_ylabel('precision', color='b')
    ax2 = ax1.twinx()
    ax2.plot(pct_above_per_thresh, recall_curve, 'r')
    ax2.set_ylabel('recall', color='r')
    ax1.set_ylim([0, 1])
    ax2.set_ylim([0, 1])  # limit the recall axis, not the shared x axis twice
    ax1.set_xlim([0, 1])
    name = model_name
    plt.title(name)
    if output_type == 'save':
        plt.savefig(name)
    elif output_type == 'show':
        plt.show()
    else:
        plt.show()


def define_clfs_params(grid_size):
    """Define defaults for different classifiers.
    Define three types of grids:
    Test: for testing your code
    Small: small grid
    Large: larger grid that has a lot more parameter sweeps
    """
    clfs = {'RF': RandomForestClassifier(n_estimators=50, n_jobs=-1),
            'ET': ExtraTreesClassifier(n_estimators=10, n_jobs=-1, criterion='entropy'),
            'AB': AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), algorithm="SAMME", n_estimators=200),
            'LR': LogisticRegression(penalty='l1', C=1e5, solver='liblinear'),  # liblinear supports the l1 penalty
            'SVM': svm.SVC(kernel='linear', probability=True, random_state=0),
            'GB': GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=10),
            'NB': GaussianNB(),
            'DT': DecisionTreeClassifier(),
            'SGD': SGDClassifier(loss="hinge", penalty="l2"),
            'KNN': KNeighborsClassifier(n_neighbors=3)
            }
    large_grid = {
        'RF': {'n_estimators': [1, 10, 100, 1000, 10000], 'max_depth': [1, 5, 10, 20, 50, 100],
               'max_features': ['sqrt', 'log2'], 'min_samples_split': [2, 5, 10], 'n_jobs': [-1]},
        'LR': {'penalty': ['l1', 'l2'], 'C': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10], 'solver': ['liblinear']},
        'SGD': {'loss': ['hinge', 'log', 'perceptron'], 'penalty': ['l2', 'l1', 'elasticnet']},
        'ET': {'n_estimators': [1, 10, 100, 1000, 10000], 'criterion': ['gini', 'entropy'],
               'max_depth': [1, 5, 10, 20, 50, 100], 'max_features': ['sqrt', 'log2'], 'min_samples_split': [2, 5, 10],
               'n_jobs': [-1]},
        'AB': {'algorithm': ['SAMME', 'SAMME.R'], 'n_estimators': [1, 10, 100, 1000, 10000]},
        'GB': {'n_estimators': [1, 10, 100, 1000, 10000], 'learning_rate': [0.001, 0.01, 0.05, 0.1, 0.5],
               'subsample': [0.1, 0.5, 1.0], 'max_depth': [1, 3, 5, 10, 20, 50, 100]},
        'NB': {},
        'DT': {'criterion': ['gini', 'entropy'], 'max_depth': [1, 5, 10, 20, 50, 100], 'min_samples_split': [2, 5, 10]},
        'SVM': {'C': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10], 'kernel': ['linear']},
        'KNN': {'n_neighbors': [1, 5, 10, 25, 50, 100], 'weights': ['uniform', 'distance'],
                'algorithm': ['auto', 'ball_tree', 'kd_tree']}
    }
    small_grid = {
        'RF': {'n_estimators': [10, 100], 'max_depth': [5, 50], 'max_features': ['sqrt', 'log2'],
               'min_samples_split': [2, 10], 'n_jobs': [-1]},
        'LR': {'penalty': ['l1', 'l2'], 'C': [0.00001, 0.001, 0.1, 1, 10], 'solver': ['liblinear']},
        'SGD': {'loss': ['log', 'perceptron'], 'penalty': ['l2', 'l1', 'elasticnet']},
        'ET': {'n_estimators': [10, 100], 'criterion': ['gini', 'entropy'], 'max_depth': [5, 50],
               'max_features': ['sqrt', 'log2'], 'min_samples_split': [2, 10], 'n_jobs': [-1]},
        'AB': {'algorithm': ['SAMME', 'SAMME.R'], 'n_estimators': [1, 10, 100, 1000, 10000]},
        'GB': {'n_estimators': [10, 100], 'learning_rate': [0.001, 0.1, 0.5], 'subsample': [0.1, 0.5, 1.0],
               'max_depth': [5, 50]},
        'NB': {},
        'DT': {'criterion': ['gini', 'entropy'], 'max_depth': [1, 5, 10, 20, 50, 100], 'min_samples_split': [2, 5, 10]},
        'SVM': {'C': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10], 'kernel': ['linear']},
        'KNN': {'n_neighbors': [1, 5, 10, 25, 50, 100], 'weights': ['uniform', 'distance'],
                'algorithm': ['auto', 'ball_tree', 'kd_tree']}
    }
    test_grid = {
        'RF': {'n_estimators': [1], 'max_depth': [1], 'max_features': ['sqrt'], 'min_samples_split': [10]},
        'LR': {'penalty': ['l1'], 'C': [0.01], 'solver': ['liblinear']},
        'SGD': {'loss': ['perceptron'], 'penalty': ['l2']},
        'ET': {'n_estimators': [1], 'criterion': ['gini'], 'max_depth': [1], 'max_features': ['sqrt'],
               'min_samples_split': [10]},
        'AB': {'algorithm': ['SAMME'], 'n_estimators': [1]},
        'GB': {'n_estimators': [1], 'learning_rate': [0.1], 'subsample': [0.5], 'max_depth': [1]},
        'NB': {},
        'DT': {'criterion': ['gini'], 'max_depth': [1], 'min_samples_split': [10]},
        'SVM': {'C': [1], 'kernel': ['linear']},
        'KNN': {'n_neighbors': [5], 'weights': ['uniform'], 'algorithm': ['auto']}
    }
    if grid_size == 'large':
        return clfs, large_grid
    elif grid_size == 'small':
        return clfs, small_grid
    elif grid_size == 'test':
        return clfs, test_grid
    else:
        return 0, 0


def clf_loop(models_to_run, clfs, grid, X_train, X_test, y_train, y_test):
    """Runs the loop using models_to_run, clfs, grid and the data
    """
    results_df = pd.DataFrame(columns=('model_type', 'clf', 'parameters', 'auc-roc', 'r_at_5', 'r_at_10', 'r_at_20',
                                       'r_at_30', 'r_at_35', 'r_at_40', 'p_at_5', 'p_at_10', 'p_at_20', 'p_at_30',
                                       'p_at_35', 'p_at_40'))
    for n in range(1, 2):
        for index, clf in enumerate([clfs[x] for x in models_to_run]):
            print(models_to_run[index])
            parameter_values = grid[models_to_run[index]]
            for p in ParameterGrid(parameter_values):
                try:
                    clf.set_params(**p)
                    y_pred_probs = clf.fit(X_train, y_train).predict_proba(X_test)[:, 1]
                    # you can also store the model, feature importances, and prediction scores
                    # we're only storing the metrics for now
                    y_pred_probs_sorted, y_test_sorted = zip(*sorted(zip(y_pred_probs, y_test), reverse=True))
                    results_df.loc[len(results_df)] = [models_to_run[index], clf, p,
                                                       roc_auc_score(y_test, y_pred_probs),
                                                       recall_at_k(y_test_sorted, y_pred_probs_sorted, 5.0),
                                                       recall_at_k(y_test_sorted, y_pred_probs_sorted, 10.0),
                                                       recall_at_k(y_test_sorted, y_pred_probs_sorted, 20.0),
                                                       recall_at_k(y_test_sorted, y_pred_probs_sorted, 30.0),
                                                       recall_at_k(y_test_sorted, y_pred_probs_sorted, 35.0),
                                                       recall_at_k(y_test_sorted, y_pred_probs_sorted, 40.0),
                                                       precision_at_k(y_test_sorted, y_pred_probs_sorted, 5.0),
                                                       precision_at_k(y_test_sorted, y_pred_probs_sorted, 10.0),
                                                       precision_at_k(y_test_sorted, y_pred_probs_sorted, 20.0),
                                                       precision_at_k(y_test_sorted, y_pred_probs_sorted, 30.0),
                                                       precision_at_k(y_test_sorted, y_pred_probs_sorted, 35.0),
                                                       precision_at_k(y_test_sorted, y_pred_probs_sorted, 40.0)]
                except IndexError as e:
                    print('Error:', e)
                    continue
    return results_df


# Create training and test set: code taken from Aurelien Geron's github https://github.com/ageron/handson-ml
def test_set_check(identifier, test_ratio, hash):
    return hash(np.int64(identifier)).digest()[-1] < 256 * test_ratio


def split_train_test_by_id(data, test_ratio, id_column, hash=hashlib.md5):
    ids = data[id_column]
    in_test_set = ids.apply(lambda id_: test_set_check(id_, test_ratio, hash))
    return data.loc[~in_test_set], data.loc[in_test_set]


def main():
    df = pd.read_csv('./data/ECommerceDataSet.csv')
    # Remove outliers
    df = df.loc[df['SessionRevenue'] != 500000]
    # Apply under sampling
    df = under_sampling(df, 'CVR')
    # Find the best model
    find_best_model(df, 'small', 'CVR', 'ml_results_after_undersampling.csv')
    # Fit the best model, in this case the random forest model
    fit_random_forest(df, 'CVR')
    # Find the AUC for the 1-feature decision tree as the baseline result
    baseline_model(df, 'CVR')


if __name__ == "__main__":
    main()
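For intuition, here is a small, hypothetical toy example (not part of ml.py) of what precision_at_k and recall_at_k compute: scores are sorted in descending order, the top k percent of rows are labelled 1, and ordinary precision/recall are taken against the re-ordered true labels.

import numpy as np
from sklearn.metrics import precision_score, recall_score

y_true = np.array([1, 1, 0, 1, 0, 0, 0, 0, 1, 0])       # 4 positives in total
y_score = np.array([0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1, 0.05])

k = 20                                                   # treat the top 20% as predicted positives
order = np.argsort(y_score)[::-1]                        # sort by score, descending
cutoff = int(len(y_score) * k / 100.0)                   # 2 rows
preds_at_k = [1 if i < cutoff else 0 for i in range(len(y_score))]

print(precision_score(y_true[order], preds_at_k))        # 1.0 - both top-scored rows are positives
print(recall_score(y_true[order], preds_at_k))           # 0.5 - 2 of the 4 positives are captured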
project1.py
Source:project1.py
# -*- coding: utf-8 -*-
"""Copy of Copy 44 Untitled1.ipynb
Automatically generated by Colaboratory.
Original file is located at
    https://colab.research.google.com/drive/1UsAVBsFGTXnNG0vWps5m_3Cht6QQqZeG
# **[call all the basic libraries:]**
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from pandas.plotting import scatter_matrix
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

"""# **start reading the training and testing data sets:**"""
train_df = pd.read_csv('/content/drive/MyDrive/train.csv')
train_df.head()
train_df.info()
dim = pd.read_csv('/content/drive/MyDrive/train.csv')
dim.info()
test_df = pd.read_csv('/content/drive/MyDrive/test.csv')
test_df.head()
test_df.info()
mean_d = dim['price'].mean()
print("Mean Value of Diamonds: $", mean_d)
dim.describe()
dim.describe(include='object')
dim.nunique()

"""# 3. **visualize** the data"""
dim.hist(figsize=(18, 10))
sns.pairplot(dim, y_vars='price')
sns.pairplot(dim)
dim["cut"].value_counts() / len(dim)
corr_matrix = dim.corr()
corr_matrix
corr_matrix['price'].sort_values(ascending=False)
plt.figure(figsize=(12, 8))
corr_matrix['price'].sort_values(ascending=False).plot(kind='bar')
plt.figure(figsize=(16, 5))
heato = sns.heatmap(corr_matrix, cmap='BrBG', annot=True)
heato.set_title('Correlation Heatmap', fontdict={'fontsize': 25})
dim.plot.scatter(x='carat', y='price', figsize=(10, 5))
dim.plot.scatter(x='z', y='price', figsize=(10, 5))
# d.plot.scatter(x='z', y='price', figsize=(10, 5))  # 'd' is only defined after the outlier-removal step below
input_cat_columns = dim.select_dtypes(include=['object']).columns.to_list()
for col in input_cat_columns:
    sns.catplot(x=col, y="price", kind="box", dodge=False, height=5, aspect=3, data=dim)

"""# **Removing the outliers:**
"""
Q1 = dim['depth'].quantile(0.25)
Q3 = dim['depth'].quantile(0.75)
IQR = Q3 - Q1
idx = ~((dim['depth'] < (Q1 - 1.5 * IQR)) | (dim['depth'] > (Q3 + 1.5 * IQR)))
d1 = dim[idx]
d1.info()
Q1x = dim['x'].quantile(0.25)
Q3x = dim['x'].quantile(0.75)
IQRx = Q3x - Q1x
idxx = (d1['x'] > (Q1x - 1.5 * IQRx)) & (d1['x'] < (Q3x + 1.5 * IQRx))
dx = d1[idxx]
dx.info()
Q1y = dim['y'].quantile(0.25)
Q3y = dim['y'].quantile(0.75)
IQRy = Q3y - Q1y
idxy = (dx['y'] > (Q1y - 1.5 * IQRy)) & (dx['y'] < (Q3y + 1.5 * IQRy))  # filter on 'y', not 'x'
dy = dx[idxy]
dy.info()
Q1z = dim['z'].quantile(0.25)
Q3z = dim['z'].quantile(0.75)
IQRz = Q3z - Q1z
idxz = (dy['z'] > (Q1z - 1.5 * IQRz)) & (dy['z'] < (Q3z + 1.5 * IQRz))
dz = dy[idxz]
dz.describe()
# dz.info()
Q1ca = dim['carat'].quantile(0.25)
Q3ca = dim['carat'].quantile(0.75)
IQRca = Q3ca - Q1ca  # use the carat quartiles, not the depth ones
idxca = (dz['carat'] > (Q1ca - 1.5 * IQRca)) & (dz['carat'] < (Q3ca + 1.5 * IQRca))
dca = dz[idxca]
dca.info()
Q1ta = dim['table'].quantile(0.25)
Q3ta = dim['table'].quantile(0.75)
IQRta = Q3ta - Q1ta
idxta = (dca['table'] > (Q1ta - 1.5 * IQRta)) & (dca['table'] < (Q3ta + 1.5 * IQRta))
d = dca[idxta]
d.info()

dix = dim.drop('price', axis=1)
diy = dim['price']
x_train, x_test, y_train, y_test = train_test_split(dix, diy, test_size=0.25, random_state=42)

# def prepare_data(df):
#     num_attribs = df.select_dtypes(include=[np.number]).columns.to_list()
#     num_pipeline = Pipeline([('std_scaler', StandardScaler())])
#     cat_attribs = ["color", "clarity", "cut"]
#     full_pipeline = ColumnTransformer([
#         ("num", num_pipeline, num_attribs),
#         ("cat", OneHotEncoder(), cat_attribs),
#     ])
#     data_prepared = full_pipeline.fit_transform(df)

num_attribs = x_train.select_dtypes(include=[np.number]).columns.to_list()
num_pipeline = Pipeline([('std_scaler', StandardScaler())])
cat_attribs = ["color", "clarity", "cut"]
full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])
train_prepared = full_pipeline.fit_transform(x_train)
test_prepared = full_pipeline.transform(x_test)  # transform only: reuse the scaler/encoder fitted on the training split

# full_pipeline = ColumnTransformer([
#     ("num", num_pipeline, num_attribs),
#     ("cat", OneHotEncoder(), cat_attribs),
# ])
# cat_cols = d.select_dtypes(include='object').columns.to_list()
# dim1 = pd.get_dummies(d, columns=cat_cols, drop_first=True)
# x = dim1.drop('price', axis=1)
# y = dim1['price']
# x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

"""# **start Selecting and Training some Models**
**1. LinearRegression model**
"""
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(train_prepared, y_train)
# some_data_prepared = full_pipeline.transform(some_data)
# print("Predictions:", lin_reg.predict(some_data_prepared))
# print("Labels:", list(some_labels))
dim1_predictions = lin_reg.predict(test_prepared)
lin_mse = mean_squared_error(y_test, dim1_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

"""*** Using Cross-Validation***"""
lin_scores = cross_val_score(lin_reg, train_prepared, y_train, scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)
print("Scores: ", lin_rmse_scores)
print("Mean: ", lin_rmse_scores.mean())
print("Standard Deviation: ", lin_rmse_scores.std())

"""**2. Decision Tree Regressor model**
"""
tree_reg = DecisionTreeRegressor()
tree_reg.fit(train_prepared, y_train)
dimtree_predictions = tree_reg.predict(test_prepared)
tree_mse = mean_squared_error(y_test, dimtree_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

"""*** Using Cross-Validation***"""
scores = cross_val_score(tree_reg, train_prepared, y_train, scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-scores)
print("Scores: ", tree_rmse_scores)
print("Mean: ", tree_rmse_scores.mean())
print("Standard Deviation: ", tree_rmse_scores.std())

"""**3. RandomForest Model:**"""
rand_for = RandomForestRegressor()
rand_for.fit(train_prepared, y_train)
ranfor_predictions = rand_for.predict(test_prepared)  # predict with the random forest, not the decision tree
forest_mse = mean_squared_error(y_test, ranfor_predictions)
forest_rmse = np.sqrt(forest_mse)
forest_rmse

"""## Evaluation Models Using Cross-Validation
"""
forest_scores = cross_val_score(rand_for, train_prepared, y_train, scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
print("Scores: ", forest_rmse_scores)
print("Mean: ", forest_rmse_scores.mean())
print("Standard Deviation: ", forest_rmse_scores.std())
final_test = full_pipeline.transform(test_df)  # transform only; the pipeline is already fitted on the training split
predictions = pd.Series(rand_for.predict(test_prepared))
pred = pd.DataFrame({'Id': test_df['Id'], 'price': predictions})
pred

"""# **Final Tune using Grid Search:**"""
from sklearn.model_selection import GridSearchCV
param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'max_features': [2, 3, 4], 'n_estimators': [3, 10]}
]
fore_reg = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(fore_reg, param_grid, cv=5, scoring='neg_mean_squared_error', return_train_score=True)
grid_search.fit(train_prepared, y_train)
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances
final_model = grid_search.best_estimator_
final_model
final_predictions = final_model.predict(test_prepared)
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
final_rmse
final_predictions = pd.Series(rand_for.predict(test_prepared))
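The outlier-removal section above repeats the same IQR filter once per column, which is where the copy-paste slips (now corrected) crept in. A compact, hypothetical helper expressing the same idea could look like the sketch below; the name remove_iqr_outliers is illustrative and not part of the notebook, and the quartiles are computed on the full frame, matching how the notebook computes them on dim.

import pandas as pd

def remove_iqr_outliers(df: pd.DataFrame, columns, factor: float = 1.5) -> pd.DataFrame:
    # Keep only rows within [Q1 - factor*IQR, Q3 + factor*IQR] for every listed column.
    mask = pd.Series(True, index=df.index)
    for col in columns:
        q1, q3 = df[col].quantile(0.25), df[col].quantile(0.75)
        iqr = q3 - q1
        mask &= df[col].between(q1 - factor * iqr, q3 + factor * iqr)
    return df[mask]

# e.g. d = remove_iqr_outliers(dim, ['depth', 'x', 'y', 'z', 'carat', 'table'])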
__init__.py
Source:__init__.py
from unittest import TestSuite

import test_doctests, test_prepared, test_equality, test_geomseq, test_xy
import test_collection, test_emptiness, test_singularity, test_validation


def test_suite():
    suite = TestSuite()
    suite.addTest(test_doctests.test_suite())
    suite.addTest(test_prepared.test_suite())
    suite.addTest(test_emptiness.test_suite())
    suite.addTest(test_equality.test_suite())
    suite.addTest(test_geomseq.test_suite())
    suite.addTest(test_xy.test_suite())
    suite.addTest(test_collection.test_suite())
    suite.addTest(test_singularity.test_suite())
    suite.addTest(test_validation.test_suite())
    return suite
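To actually execute the assembled suite, one minimal option (assuming this __init__.py lives in an importable package, here given the hypothetical name tests) is a standard unittest text runner:

import unittest
from tests import test_suite  # hypothetical package name - adjust to wherever this __init__.py lives

if __name__ == "__main__":
    runner = unittest.TextTestRunner(verbosity=2)
    runner.run(test_suite())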