Best Python code snippet using slash
_tree.py
Source:_tree.py
...77 See `Tree explainer examples <https://shap.readthedocs.io/en/latest/api_examples/explainers/Tree.html>`_78 """79 if feature_names is not None:80 self.data_feature_names=feature_names81 elif safe_isinstance(data, "pandas.core.frame.DataFrame"):82 self.data_feature_names = list(data.columns)83 masker = data84 super(Tree, self).__init__(model, masker, feature_names=feature_names)85 if type(self.masker) is maskers.Independent:86 data = self.masker.data87 elif masker is not None:88 raise Exception("Unsupported masker type: %s!" % str(type(self.masker)))89 if getattr(self.masker, "clustering", None) is not None:90 raise Exception("TreeExplainer does not support clustered data inputs! Please use shap.Explainer or pass an unclustered masker!")91 # check for deprecated options92 if model_output == "margin":93 warnings.warn("model_output = \"margin\" has been renamed to model_output = \"raw\"")94 model_output = "raw"95 if model_output == "logloss":96 warnings.warn("model_output = \"logloss\" has been renamed to model_output = \"log_loss\"")97 model_output = "log_loss"98 if "feature_dependence" in deprecated_options:99 dep_val = deprecated_options["feature_dependence"]100 if dep_val == "independent" and feature_perturbation == "interventional":101 warnings.warn("feature_dependence = \"independent\" has been renamed to feature_perturbation" \102 " = \"interventional\"! See GitHub issue #882.")103 elif feature_perturbation != "interventional":104 warnings.warn("feature_dependence = \"independent\" has been renamed to feature_perturbation" \105 " = \"interventional\", you can't supply both options! See GitHub issue #882.")106 if dep_val == "tree_path_dependent" and feature_perturbation == "interventional":107 raise Exception("The feature_dependence option has been renamed to feature_perturbation! " \108 "Please update the option name before calling TreeExplainer. See GitHub issue #882.")109 if feature_perturbation == "independent":110 raise Exception("feature_perturbation = \"independent\" is not a valid option value, please use " \111 "feature_perturbation = \"interventional\" instead. See GitHub issue #882.")112 if safe_isinstance(data, "pandas.core.frame.DataFrame"):113 self.data = data.values114 elif isinstance(data, DenseData):115 self.data = data.data116 else:117 self.data = data118 if self.data is None:119 feature_perturbation = "tree_path_dependent"120 #warnings.warn("Setting feature_perturbation = \"tree_path_dependent\" because no background data was given.")121 elif feature_perturbation == "interventional" and self.data.shape[0] > 1000:122 warnings.warn("Passing "+str(self.data.shape[0]) + " background samples may lead to slow runtimes. Consider "123 "using shap.sample(data, 100) to create a smaller background data set.")124 self.data_missing = None if self.data is None else pd.isna(self.data)125 self.feature_perturbation = feature_perturbation126 self.expected_value = None127 self.model = TreeEnsemble(model, self.data, self.data_missing, model_output)128 self.model_output = model_output129 #self.model_output = self.model.model_output # this allows the TreeEnsemble to translate model outputs types by how it loads the model130 if feature_perturbation not in feature_perturbation_codes:131 raise ValueError("Invalid feature_perturbation option!")132 # check for unsupported combinations of feature_perturbation and model_outputs133 if feature_perturbation == "tree_path_dependent":134 if self.model.model_output != "raw":135 raise ValueError("Only model_output=\"raw\" is supported for feature_perturbation=\"tree_path_dependent\"")136 elif data is None:137 raise ValueError("A background dataset must be provided unless you are using feature_perturbation=\"tree_path_dependent\"!")138 if self.model.model_output != "raw":139 if self.model.objective is None and self.model.tree_output is None:140 raise Exception("Model does not have a known objective or output type! When model_output is " \141 "not \"raw\" then we need to know the model's objective or link function.")142 # A bug in XGBoost fixed in v0.81 makes XGBClassifier fail to give margin outputs143 if safe_isinstance(model, "xgboost.sklearn.XGBClassifier") and self.model.model_output != "raw":144 import xgboost145 if LooseVersion(xgboost.__version__) < LooseVersion('0.81'):146 raise RuntimeError("A bug in XGBoost fixed in v0.81 makes XGBClassifier fail to give margin outputs! Please upgrade to XGBoost >= v0.81!")147 # compute the expected value if we have a parsed tree for the cext148 if self.model.model_output == "log_loss":149 self.expected_value = self.__dynamic_expected_value150 elif data is not None:151 try:152 self.expected_value = self.model.predict(self.data).mean(0)153 except ValueError:154 raise Exception("Currently TreeExplainer can only handle models with categorical splits when " \155 "feature_perturbation=\"tree_path_dependent\" and no background data is passed. Please try again using " \156 "shap.TreeExplainer(model, feature_perturbation=\"tree_path_dependent\").")157 if hasattr(self.expected_value, '__len__') and len(self.expected_value) == 1:158 self.expected_value = self.expected_value[0]159 elif hasattr(self.model, "node_sample_weight"):160 self.expected_value = self.model.values[:,0].sum(0)161 if self.expected_value.size == 1:162 self.expected_value = self.expected_value[0]163 self.expected_value += self.model.base_offset164 if self.model.model_output != "raw":165 self.expected_value = None # we don't handle transforms in this case right now...166 # if our output format requires binary classification to be represented as two outputs then we do that here167 if self.model.model_output == "probability_doubled" and self.expected_value is not None:168 self.expected_value = [1-self.expected_value, self.expected_value]169 def __dynamic_expected_value(self, y):170 """ This computes the expected value conditioned on the given label value.171 """172 return self.model.predict(self.data, np.ones(self.data.shape[0]) * y).mean(0)173 def __call__(self, X, y=None, interactions=False, check_additivity=True):174 if safe_isinstance(X, "pandas.core.frame.DataFrame"):175 feature_names = list(X.columns)176 X = X.values177 else:178 feature_names = getattr(self, "data_feature_names", None)179 if not interactions:180 v = self.shap_values(X, y=y, from_call=True, check_additivity=check_additivity)181 output_shape = tuple()182 if type(v) is list:183 output_shape = (len(v),)184 v = np.stack(v, axis=-1) # put outputs at the end185 # the explanation object expects an expected value for each row186 if hasattr(self.expected_value, "__len__"):187 ev_tiled = np.tile(self.expected_value, (v.shape[0],1))188 else:189 ev_tiled = np.tile(self.expected_value, v.shape[0])190 e = Explanation(v, base_values=ev_tiled, data=X, feature_names=feature_names)191 else:192 v = self.shap_interaction_values(X)193 e = Explanation(v, base_values=self.expected_value, data=X, feature_names=feature_names, interaction_order=2)194 return e195 def _validate_inputs(self, X, y, tree_limit, check_additivity):196 # see if we have a default tree_limit in place.197 if tree_limit is None:198 tree_limit = -1 if self.model.tree_limit is None else self.model.tree_limit199 if tree_limit < 0 or tree_limit > self.model.values.shape[0]:200 tree_limit = self.model.values.shape[0]201 # convert dataframes202 if safe_isinstance(X, "pandas.core.series.Series"):203 X = X.values204 elif safe_isinstance(X, "pandas.core.frame.DataFrame"):205 X = X.values206 flat_output = False207 if len(X.shape) == 1:208 flat_output = True209 X = X.reshape(1, X.shape[0])210 if X.dtype != self.model.input_dtype:211 X = X.astype(self.model.input_dtype)212 X_missing = np.isnan(X, dtype=np.bool)213 assert isinstance(X, np.ndarray), "Unknown instance type: " + str(type(X))214 assert len(X.shape) == 2, "Passed input data matrix X must have 1 or 2 dimensions!"215 if self.model.model_output == "log_loss":216 assert y is not None, "Both samples and labels must be provided when model_output = " \217 "\"log_loss\" (i.e. `explainer.shap_values(X, y)`)!"218 assert X.shape[0] == len(219 y), "The number of labels (%d) does not match the number of samples to explain (" \220 "%d)!" % (221 len(y), X.shape[0])222 if self.feature_perturbation == "tree_path_dependent":223 assert self.model.fully_defined_weighting, "The background dataset you provided does " \224 "not cover all the leaves in the model, " \225 "so TreeExplainer cannot run with the " \226 "feature_perturbation=\"tree_path_dependent\" option! " \227 "Try providing a larger background " \228 "dataset, or using " \229 "feature_perturbation=\"interventional\"."230 if check_additivity and self.model.model_type == "pyspark":231 warnings.warn(232 "check_additivity requires us to run predictions which is not supported with "233 "spark, "234 "ignoring."235 " Set check_additivity=False to remove this warning")236 check_additivity = False237 return X, y, X_missing, flat_output, tree_limit, check_additivity238 def shap_values(self, X, y=None, tree_limit=None, approximate=False, check_additivity=True, from_call=False):239 """ Estimate the SHAP values for a set of samples.240 Parameters241 ----------242 X : numpy.array, pandas.DataFrame or catboost.Pool (for catboost)243 A matrix of samples (# samples x # features) on which to explain the model's output.244 y : numpy.array245 An array of label values for each sample. Used when explaining loss functions.246 tree_limit : None (default) or int247 Limit the number of trees used by the model. By default None means no use the limit of the248 original model, and -1 means no limit.249 approximate : bool250 Run fast, but only roughly approximate the Tree SHAP values. This runs a method251 previously proposed by Saabas which only considers a single feature ordering. Take care252 since this does not have the consistency guarantees of Shapley values and places too253 much weight on lower splits in the tree.254 check_additivity : bool255 Run a validation check that the sum of the SHAP values equals the output of the model. This256 check takes only a small amount of time, and will catch potential unforeseen errors.257 Note that this check only runs right now when explaining the margin of the model.258 Returns259 -------260 array or list261 For models with a single output this returns a matrix of SHAP values262 (# samples x # features). Each row sums to the difference between the model output for that263 sample and the expected value of the model output (which is stored in the expected_value264 attribute of the explainer when it is constant). For models with vector outputs this returns265 a list of such matrices, one for each output.266 """267 # see if we have a default tree_limit in place.268 if tree_limit is None:269 tree_limit = -1 if self.model.tree_limit is None else self.model.tree_limit270 # shortcut using the C++ version of Tree SHAP in XGBoost, LightGBM, and CatBoost271 if self.feature_perturbation == "tree_path_dependent" and self.model.model_type != "internal" and self.data is None:272 model_output_vals = None273 phi = None274 if self.model.model_type == "xgboost":275 import xgboost276 if not isinstance(X, xgboost.core.DMatrix):277 X = xgboost.DMatrix(X)278 if tree_limit == -1:279 tree_limit = 0280 try:281 phi = self.model.original_model.predict(282 X, ntree_limit=tree_limit, pred_contribs=True,283 approx_contribs=approximate, validate_features=False284 )285 except ValueError as e:286 raise ValueError("This reshape error is often caused by passing a bad data matrix to SHAP. " \287 "See https://github.com/slundberg/shap/issues/580") from e288 if check_additivity and self.model.model_output == "raw":289 model_output_vals = self.model.original_model.predict(290 X, ntree_limit=tree_limit, output_margin=True,291 validate_features=False292 )293 elif self.model.model_type == "lightgbm":294 assert not approximate, "approximate=True is not supported for LightGBM models!"295 phi = self.model.original_model.predict(X, num_iteration=tree_limit, pred_contrib=True)296 # Note: the data must be joined on the last axis297 if self.model.original_model.params['objective'] == 'binary':298 if not from_call:299 warnings.warn('LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray')300 phi = np.concatenate((0-phi, phi), axis=-1)301 if phi.shape[1] != X.shape[1] + 1:302 try:303 phi = phi.reshape(X.shape[0], phi.shape[1]//(X.shape[1]+1), X.shape[1]+1)304 except ValueError as e:305 raise Exception("This reshape error is often caused by passing a bad data matrix to SHAP. " \306 "See https://github.com/slundberg/shap/issues/580") from e307 elif self.model.model_type == "catboost": # thanks to the CatBoost team for implementing this...308 assert not approximate, "approximate=True is not supported for CatBoost models!"309 assert tree_limit == -1, "tree_limit is not yet supported for CatBoost models!"310 import catboost311 if type(X) != catboost.Pool:312 X = catboost.Pool(X, cat_features=self.model.cat_feature_indices)313 phi = self.model.original_model.get_feature_importance(data=X, fstr_type='ShapValues')314 # note we pull off the last column and keep it as our expected_value315 if phi is not None:316 if len(phi.shape) == 3:317 self.expected_value = [phi[0, i, -1] for i in range(phi.shape[1])]318 out = [phi[:, i, :-1] for i in range(phi.shape[1])]319 else:320 self.expected_value = phi[0, -1]321 out = phi[:, :-1]322 if check_additivity and model_output_vals is not None:323 self.assert_additivity(out, model_output_vals)324 return out325 X, y, X_missing, flat_output, tree_limit, check_additivity = self._validate_inputs(X, y,326 tree_limit,327 check_additivity)328 transform = self.model.get_transform()329 # run the core algorithm using the C extension330 assert_import("cext")331 phi = np.zeros((X.shape[0], X.shape[1]+1, self.model.num_outputs))332 if not approximate:333 _cext.dense_tree_shap(334 self.model.children_left, self.model.children_right, self.model.children_default,335 self.model.features, self.model.thresholds, self.model.values, self.model.node_sample_weight,336 self.model.max_depth, X, X_missing, y, self.data, self.data_missing, tree_limit,337 self.model.base_offset, phi, feature_perturbation_codes[self.feature_perturbation],338 output_transform_codes[transform], False339 )340 else:341 _cext.dense_tree_saabas(342 self.model.children_left, self.model.children_right, self.model.children_default,343 self.model.features, self.model.thresholds, self.model.values,344 self.model.max_depth, tree_limit, self.model.base_offset, output_transform_codes[transform],345 X, X_missing, y, phi346 )347 out = self._get_shap_output(phi, flat_output)348 if check_additivity and self.model.model_output == "raw":349 self.assert_additivity(out, self.model.predict(X))350 return out351 # we pull off the last column and keep it as our expected_value352 def _get_shap_output(self, phi, flat_output):353 if self.model.num_outputs == 1:354 if self.expected_value is None and self.model.model_output != "log_loss":355 self.expected_value = phi[0, -1, 0]356 if flat_output:357 out = phi[0, :-1, 0]358 else:359 out = phi[:, :-1, 0]360 else:361 if self.expected_value is None and self.model.model_output != "log_loss":362 self.expected_value = [phi[0, -1, i] for i in range(phi.shape[2])]363 if flat_output:364 out = [phi[0, :-1, i] for i in range(self.model.num_outputs)]365 else:366 out = [phi[:, :-1, i] for i in range(self.model.num_outputs)]367 # if our output format requires binary classificaiton to be represented as two outputs then we do that here368 if self.model.model_output == "probability_doubled":369 out = [-out, out]370 return out371 def shap_interaction_values(self, X, y=None, tree_limit=None):372 """ Estimate the SHAP interaction values for a set of samples.373 Parameters374 ----------375 X : numpy.array, pandas.DataFrame or catboost.Pool (for catboost)376 A matrix of samples (# samples x # features) on which to explain the model's output.377 y : numpy.array378 An array of label values for each sample. Used when explaining loss functions (not yet supported).379 tree_limit : None (default) or int380 Limit the number of trees used by the model. By default None means no use the limit of the381 original model, and -1 means no limit.382 Returns383 -------384 array or list385 For models with a single output this returns a tensor of SHAP values386 (# samples x # features x # features). The matrix (# features x # features) for each sample sums387 to the difference between the model output for that sample and the expected value of the model output388 (which is stored in the expected_value attribute of the explainer). Each row of this matrix sums to the389 SHAP value for that feature for that sample. The diagonal entries of the matrix represent the390 "main effect" of that feature on the prediction and the symmetric off-diagonal entries represent the391 interaction effects between all pairs of features for that sample. For models with vector outputs392 this returns a list of tensors, one for each output.393 """394 assert self.model.model_output == "raw", "Only model_output = \"raw\" is supported for SHAP interaction values right now!"395 #assert self.feature_perturbation == "tree_path_dependent", "Only feature_perturbation = \"tree_path_dependent\" is supported for SHAP interaction values right now!"396 transform = "identity"397 # see if we have a default tree_limit in place.398 if tree_limit is None:399 tree_limit = -1 if self.model.tree_limit is None else self.model.tree_limit400 # shortcut using the C++ version of Tree SHAP in XGBoost401 if self.model.model_type == "xgboost" and self.feature_perturbation == "tree_path_dependent":402 import xgboost403 if not isinstance(X, xgboost.core.DMatrix):404 X = xgboost.DMatrix(X)405 if tree_limit == -1:406 tree_limit = 0407 phi = self.model.original_model.predict(X, ntree_limit=tree_limit, pred_interactions=True, validate_features=False)408 # note we pull off the last column and keep it as our expected_value409 if len(phi.shape) == 4:410 self.expected_value = [phi[0, i, -1, -1] for i in range(phi.shape[1])]411 return [phi[:, i, :-1, :-1] for i in range(phi.shape[1])]412 else:413 self.expected_value = phi[0, -1, -1]414 return phi[:, :-1, :-1]415 X, y, X_missing, flat_output, tree_limit, _ = self._validate_inputs(X, y, tree_limit, False)416 # run the core algorithm using the C extension417 assert_import("cext")418 phi = np.zeros((X.shape[0], X.shape[1]+1, X.shape[1]+1, self.model.num_outputs))419 _cext.dense_tree_shap(420 self.model.children_left, self.model.children_right, self.model.children_default,421 self.model.features, self.model.thresholds, self.model.values, self.model.node_sample_weight,422 self.model.max_depth, X, X_missing, y, self.data, self.data_missing, tree_limit,423 self.model.base_offset, phi, feature_perturbation_codes[self.feature_perturbation],424 output_transform_codes[transform], True425 )426 return self._get_shap_interactions_output(phi,flat_output)427 # we pull off the last column and keep it as our expected_value428 def _get_shap_interactions_output(self, phi, flat_output):429 if self.model.num_outputs == 1:430 self.expected_value = phi[0, -1, -1, 0]431 if flat_output:432 out = phi[0, :-1, :-1, 0]433 else:434 out = phi[:, :-1, :-1, 0]435 else:436 self.expected_value = [phi[0, -1, -1, i] for i in range(phi.shape[3])]437 if flat_output:438 out = [phi[0, :-1, :-1, i] for i in range(self.model.num_outputs)]439 else:440 out = [phi[:, :-1, :-1, i] for i in range(self.model.num_outputs)]441 return out442 def assert_additivity(self, phi, model_output):443 def check_sum(sum_val, model_output):444 diff = np.abs(sum_val - model_output)445 if np.max(diff / (np.abs(sum_val) + 1e-2)) > 1e-2:446 ind = np.argmax(diff)447 err_msg = "Additivity check failed in TreeExplainer! Please ensure the data matrix you passed to the " \448 "explainer is the same shape that the model was trained on. If your data shape is correct " \449 "then please report this on GitHub."450 if self.feature_perturbation != "interventional":451 err_msg += " Consider retrying with the feature_perturbation='interventional' option."452 err_msg += " This check failed because for one of the samples the sum of the SHAP values" \453 " was %f, while the model output was %f. If this difference is acceptable" \454 " you can set check_additivity=False to disable this check." % (sum_val[ind], model_output[ind])455 raise Exception(err_msg)456 if type(phi) is list:457 for i in range(len(phi)):458 check_sum(self.expected_value[i] + phi[i].sum(-1), model_output[:,i])459 else:460 check_sum(self.expected_value + phi.sum(-1), model_output)461 @staticmethod462 def supports_model_with_masker(model, masker):463 """ Determines if this explainer can handle the given model.464 This is an abstract static method meant to be implemented by each subclass.465 """466 if not isinstance(masker, (maskers.Independent)) and masker is not None:467 return False468 try:469 TreeEnsemble(model)470 except:471 return False472 return True473class TreeEnsemble:474 """ An ensemble of decision trees.475 This object provides a common interface to many different types of models.476 """477 def __init__(self, model, data=None, data_missing=None, model_output=None):478 self.model_type = "internal"479 self.trees = None480 self.base_offset = 0481 self.model_output = model_output482 self.objective = None # what we explain when explaining the loss of the model483 self.tree_output = None # what are the units of the values in the leaves of the trees484 self.internal_dtype = np.float64485 self.input_dtype = np.float64 # for sklearn we need to use np.float32 to always get exact matches to their predictions486 self.data = data487 self.data_missing = data_missing488 self.fully_defined_weighting = True # does the background dataset land in every leaf (making it valid for the tree_path_dependent method)489 self.tree_limit = None # used for limiting the number of trees we use by default (like from early stopping)490 self.num_stacked_models = 1 # If this is greater than 1 it means we have multiple stacked models with the same number of trees in each model (XGBoost multi-output style)491 self.cat_feature_indices = None # If this is set it tells us which features are treated categorically492 # we use names like keras493 objective_name_map = {494 "mse": "squared_error",495 "variance": "squared_error",496 "friedman_mse": "squared_error",497 "reg:linear": "squared_error",498 "reg:squarederror": "squared_error",499 "regression": "squared_error",500 "regression_l2": "squared_error",501 "mae": "absolute_error",502 "gini": "binary_crossentropy",503 "entropy": "binary_crossentropy",504 "reg:logistic": "binary_crossentropy",505 "binary:logistic": "binary_crossentropy",506 "binary_logloss": "binary_crossentropy",507 "binary": "binary_crossentropy"508 }509 tree_output_name_map = {510 "regression": "raw_value",511 "regression_l2": "squared_error",512 "reg:linear": "raw_value",513 "reg:squarederror": "raw_value",514 "reg:logistic": "log_odds",515 "binary:logistic": "log_odds",516 "binary_logloss": "log_odds",517 "binary": "log_odds"518 }519 if type(model) is dict and "trees" in model:520 # This allows a dictionary to be passed that represents the model.521 # this dictionary has several numerica paramters and also a list of trees522 # where each tree is a dictionary describing that tree523 if "internal_dtype" in model:524 self.internal_dtype = model["internal_dtype"]525 if "input_dtype" in model:526 self.input_dtype = model["input_dtype"]527 if "objective" in model:528 self.objective = model["objective"]529 if "tree_output" in model:530 self.tree_output = model["tree_output"]531 if "base_offset" in model:532 self.base_offset = model["base_offset"]533 self.trees = [SingleTree(t, data=data, data_missing=data_missing) for t in model["trees"]]534 elif type(model) is list and type(model[0]) == SingleTree: # old-style direct-load format535 self.trees = model536 elif safe_isinstance(model, ["sklearn.ensemble.RandomForestRegressor", "sklearn.ensemble.forest.RandomForestRegressor", "econml.grf._base_grf.BaseGRF"]):537 assert hasattr(model, "estimators_"), "Model has no `estimators_`! Have you called `model.fit`?"538 self.internal_dtype = model.estimators_[0].tree_.value.dtype.type539 self.input_dtype = np.float32540 scaling = 1.0 / len(model.estimators_) # output is average of trees541 self.trees = [SingleTree(e.tree_, scaling=scaling, data=data, data_missing=data_missing) for e in model.estimators_]542 self.objective = objective_name_map.get(model.criterion, None)543 self.tree_output = "raw_value"544 elif safe_isinstance(model, ["sklearn.ensemble.IsolationForest", "sklearn.ensemble._iforest.IsolationForest"]):545 self.dtype = np.float32546 scaling = 1.0 / len(model.estimators_) # output is average of trees547 self.trees = [IsoTree(e.tree_, f, scaling=scaling, data=data, data_missing=data_missing) for e, f in zip(model.estimators_, model.estimators_features_)]548 self.tree_output = "raw_value"549 elif safe_isinstance(model, ["pyod.models.iforest.IForest"]):550 self.dtype = np.float32551 scaling = 1.0 / len(model.estimators_) # output is average of trees552 self.trees = [IsoTree(e.tree_, f, scaling=scaling, data=data, data_missing=data_missing) for e, f in zip(model.detector_.estimators_, model.detector_.estimators_features_)]553 self.tree_output = "raw_value"554 elif safe_isinstance(model, "skopt.learning.forest.RandomForestRegressor"):555 assert hasattr(model, "estimators_"), "Model has no `estimators_`! Have you called `model.fit`?"556 self.internal_dtype = model.estimators_[0].tree_.value.dtype.type557 self.input_dtype = np.float32558 scaling = 1.0 / len(model.estimators_) # output is average of trees559 self.trees = [SingleTree(e.tree_, scaling=scaling, data=data, data_missing=data_missing) for e in model.estimators_]560 self.objective = objective_name_map.get(model.criterion, None)561 self.tree_output = "raw_value"562 elif safe_isinstance(model, "sklearn.ensemble.AdaBoostRegressor"):563 assert hasattr(model, "estimators_"), "Model has no `estimators_`! Have you called `model.fit`?"564 self.internal_dtype = model.estimators_[0].tree_.value.dtype.type565 self.input_dtype = np.float32566 scaling = 1.0 / len(model.estimators_) # output is average of trees567 self.trees = [SingleTree(e.tree_, scaling=scaling, data=data, data_missing=data_missing) for e in model.estimators_]568 self.objective = objective_name_map.get(model.base_estimator_.criterion, None)569 self.tree_output = "raw_value"570 elif safe_isinstance(model, ["sklearn.ensemble.ExtraTreesRegressor", "sklearn.ensemble.forest.ExtraTreesRegressor"]):571 assert hasattr(model, "estimators_"), "Model has no `estimators_`! Have you called `model.fit`?"572 self.internal_dtype = model.estimators_[0].tree_.value.dtype.type573 self.input_dtype = np.float32574 scaling = 1.0 / len(model.estimators_) # output is average of trees575 self.trees = [SingleTree(e.tree_, scaling=scaling, data=data, data_missing=data_missing) for e in model.estimators_]576 self.objective = objective_name_map.get(model.criterion, None)577 self.tree_output = "raw_value"578 elif safe_isinstance(model, "skopt.learning.forest.ExtraTreesRegressor"):579 assert hasattr(model, "estimators_"), "Model has no `estimators_`! Have you called `model.fit`?"580 self.internal_dtype = model.estimators_[0].tree_.value.dtype.type581 self.input_dtype = np.float32582 scaling = 1.0 / len(model.estimators_) # output is average of trees583 self.trees = [SingleTree(e.tree_, scaling=scaling, data=data, data_missing=data_missing) for e in model.estimators_]584 self.objective = objective_name_map.get(model.criterion, None)585 self.tree_output = "raw_value"586 elif safe_isinstance(model, ["sklearn.tree.DecisionTreeRegressor", "sklearn.tree.tree.DecisionTreeRegressor", "econml.grf._base_grftree.GRFTree"]):587 self.internal_dtype = model.tree_.value.dtype.type588 self.input_dtype = np.float32589 self.trees = [SingleTree(model.tree_, data=data, data_missing=data_missing)]590 self.objective = objective_name_map.get(model.criterion, None)591 self.tree_output = "raw_value"592 elif safe_isinstance(model, ["sklearn.tree.DecisionTreeClassifier", "sklearn.tree.tree.DecisionTreeClassifier"]):593 self.internal_dtype = model.tree_.value.dtype.type594 self.input_dtype = np.float32595 self.trees = [SingleTree(model.tree_, normalize=True, data=data, data_missing=data_missing)]596 self.objective = objective_name_map.get(model.criterion, None)597 self.tree_output = "probability"598 elif safe_isinstance(model, ["sklearn.ensemble.RandomForestClassifier", "sklearn.ensemble.forest.RandomForestClassifier"]):599 assert hasattr(model, "estimators_"), "Model has no `estimators_`! Have you called `model.fit`?"600 self.internal_dtype = model.estimators_[0].tree_.value.dtype.type601 self.input_dtype = np.float32602 scaling = 1.0 / len(model.estimators_) # output is average of trees603 self.trees = [SingleTree(e.tree_, normalize=True, scaling=scaling, data=data, data_missing=data_missing) for e in model.estimators_]604 self.objective = objective_name_map.get(model.criterion, None)605 self.tree_output = "probability"606 elif safe_isinstance(model, ["sklearn.ensemble.AdaBoostClassifier", "sklearn.ensemble._weighted_boosting.AdaBoostClassifier"]):607 assert hasattr(model, "estimators_"), "Model has no `estimators_`! Have you called `model.fit`?"608 self.internal_dtype = model.estimators_[0].tree_.value.dtype.type609 self.input_dtype = np.float32610 scaling = 1.0 / len(model.estimators_) # output is average of trees611 self.trees = [SingleTree(e.tree_, normalize=True, scaling=scaling) for e in model.estimators_]612 self.objective = objective_name_map.get(model.base_estimator_.criterion, None) #This line is done to get the decision criteria, for example gini.613 self.tree_output = "probability" #This is the last line added614 elif safe_isinstance(model, ["sklearn.ensemble.ExtraTreesClassifier", "sklearn.ensemble.forest.ExtraTreesClassifier"]): # TODO: add unit test for this case615 assert hasattr(model, "estimators_"), "Model has no `estimators_`! Have you called `model.fit`?"616 self.internal_dtype = model.estimators_[0].tree_.value.dtype.type617 self.input_dtype = np.float32618 scaling = 1.0 / len(model.estimators_) # output is average of trees619 self.trees = [SingleTree(e.tree_, normalize=True, scaling=scaling, data=data, data_missing=data_missing) for e in model.estimators_]620 self.objective = objective_name_map.get(model.criterion, None)621 self.tree_output = "probability"622 elif safe_isinstance(model, ["sklearn.ensemble.GradientBoostingRegressor", "sklearn.ensemble.gradient_boosting.GradientBoostingRegressor"]):623 self.input_dtype = np.float32624 # currently we only support the mean and quantile estimators625 if safe_isinstance(model.init_, ["sklearn.ensemble.MeanEstimator", "sklearn.ensemble.gradient_boosting.MeanEstimator"]):626 self.base_offset = model.init_.mean627 elif safe_isinstance(model.init_, ["sklearn.ensemble.QuantileEstimator", "sklearn.ensemble.gradient_boosting.QuantileEstimator"]):628 self.base_offset = model.init_.quantile629 elif safe_isinstance(model.init_, "sklearn.dummy.DummyRegressor"):630 self.base_offset = model.init_.constant_[0]631 else:632 assert False, "Unsupported init model type: " + str(type(model.init_))633 self.trees = [SingleTree(e.tree_, scaling=model.learning_rate, data=data, data_missing=data_missing) for e in model.estimators_[:,0]]634 self.objective = objective_name_map.get(model.criterion, None)635 self.tree_output = "raw_value"636 elif safe_isinstance(model, ["sklearn.ensemble.HistGradientBoostingRegressor"]):637 import sklearn638 if self.model_output == "predict":639 self.model_output = "raw"640 self.input_dtype = sklearn.ensemble._hist_gradient_boosting.common.X_DTYPE641 self.base_offset = model._baseline_prediction642 self.trees = []643 for p in model._predictors:644 nodes = p[0].nodes645 # each node has values: ('value', 'count', 'feature_idx', 'threshold', 'missing_go_to_left', 'left', 'right', 'gain', 'depth', 'is_leaf', 'bin_threshold')646 tree = {647 "children_left": np.array([-1 if n[9] else n[5] for n in nodes]),648 "children_right": np.array([-1 if n[9] else n[6] for n in nodes]),649 "children_default": np.array([-1 if n[9] else (n[5] if n[4] else n[6]) for n in nodes]),650 "features": np.array([-2 if n[9] else n[2] for n in nodes]),651 "thresholds": np.array([n[3] for n in nodes], dtype=np.float64),652 "values": np.array([[n[0]] for n in nodes], dtype=np.float64),653 "node_sample_weight": np.array([n[1] for n in nodes], dtype=np.float64),654 }655 self.trees.append(SingleTree(tree, data=data, data_missing=data_missing))656 self.objective = objective_name_map.get(model.loss, None)657 self.tree_output = "raw_value"658 elif safe_isinstance(model, ["sklearn.ensemble.HistGradientBoostingClassifier"]):659 import sklearn660 self.base_offset = model._baseline_prediction661 if hasattr(self.base_offset, "__len__") and self.model_output != "raw":662 raise Exception("Multi-output HistGradientBoostingClassifier models are not yet supported unless model_output=\"raw\". See GitHub issue #1028")663 self.input_dtype = sklearn.ensemble._hist_gradient_boosting.common.X_DTYPE664 self.num_stacked_models = len(model._predictors[0])665 if self.model_output == "predict_proba":666 if self.num_stacked_models == 1:667 self.model_output = "probability_doubled" # with predict_proba we need to double the outputs to match668 else:669 self.model_output = "probability"670 self.trees = []671 for p in model._predictors:672 for i in range(self.num_stacked_models):673 nodes = p[i].nodes674 # each node has values: ('value', 'count', 'feature_idx', 'threshold', 'missing_go_to_left', 'left', 'right', 'gain', 'depth', 'is_leaf', 'bin_threshold')675 tree = {676 "children_left": np.array([-1 if n[9] else n[5] for n in nodes]),677 "children_right": np.array([-1 if n[9] else n[6] for n in nodes]),678 "children_default": np.array([-1 if n[9] else (n[5] if n[4] else n[6]) for n in nodes]),679 "features": np.array([-2 if n[9] else n[2] for n in nodes]),680 "thresholds": np.array([n[3] for n in nodes], dtype=np.float64),681 "values": np.array([[n[0]] for n in nodes], dtype=np.float64),682 "node_sample_weight": np.array([n[1] for n in nodes], dtype=np.float64),683 }684 self.trees.append(SingleTree(tree, data=data, data_missing=data_missing))685 self.objective = objective_name_map.get(model.loss, None)686 self.tree_output = "log_odds"687 elif safe_isinstance(model, ["sklearn.ensemble.GradientBoostingClassifier","sklearn.ensemble._gb.GradientBoostingClassifier", "sklearn.ensemble.gradient_boosting.GradientBoostingClassifier"]):688 self.input_dtype = np.float32689 # TODO: deal with estimators for each class690 if model.estimators_.shape[1] > 1:691 assert False, "GradientBoostingClassifier is only supported for binary classification right now!"692 # currently we only support the logs odds estimator693 if safe_isinstance(model.init_, ["sklearn.ensemble.LogOddsEstimator", "sklearn.ensemble.gradient_boosting.LogOddsEstimator"]):694 self.base_offset = model.init_.prior695 self.tree_output = "log_odds"696 elif safe_isinstance(model.init_, "sklearn.dummy.DummyClassifier"):697 self.base_offset = scipy.special.logit(model.init_.class_prior_[1]) # with two classes the trees only model the second class. # pylint: disable=no-member698 self.tree_output = "log_odds"699 else:700 assert False, "Unsupported init model type: " + str(type(model.init_))701 self.trees = [SingleTree(e.tree_, scaling=model.learning_rate, data=data, data_missing=data_missing) for e in model.estimators_[:,0]]702 self.objective = objective_name_map.get(model.criterion, None)703 elif "pyspark.ml" in str(type(model)):704 assert_import("pyspark")705 self.model_type = "pyspark"706 # model._java_obj.getImpurity() can be gini, entropy or variance.707 self.objective = objective_name_map.get(model._java_obj.getImpurity(), None)708 if "Classification" in str(type(model)):709 normalize = True710 self.tree_output = "probability"711 else:712 normalize = False713 self.tree_output = "raw_value"714 # Spark Random forest, create 1 weighted (avg) tree per sub-model715 if safe_isinstance(model, "pyspark.ml.classification.RandomForestClassificationModel") \716 or safe_isinstance(model, "pyspark.ml.regression.RandomForestRegressionModel"):717 sum_weight = sum(model.treeWeights) # output is average of trees718 self.trees = [SingleTree(tree, normalize=normalize, scaling=model.treeWeights[i]/sum_weight) for i, tree in enumerate(model.trees)]719 # Spark GBT, create 1 weighted (learning rate) tree per sub-model720 elif safe_isinstance(model, "pyspark.ml.classification.GBTClassificationModel") \721 or safe_isinstance(model, "pyspark.ml.regression.GBTRegressionModel"):722 self.objective = "squared_error" # GBT subtree use the variance723 self.tree_output = "raw_value"724 self.trees = [SingleTree(tree, normalize=False, scaling=model.treeWeights[i]) for i, tree in enumerate(model.trees)]725 # Spark Basic model (single tree)726 elif safe_isinstance(model, "pyspark.ml.classification.DecisionTreeClassificationModel") \727 or safe_isinstance(model, "pyspark.ml.regression.DecisionTreeRegressionModel"):728 self.trees = [SingleTree(model, normalize=normalize, scaling=1)]729 else:730 assert False, "Unsupported Spark model type: " + str(type(model))731 elif safe_isinstance(model, "xgboost.core.Booster"):732 import xgboost733 self.original_model = model734 self.model_type = "xgboost"735 xgb_loader = XGBTreeModelLoader(self.original_model)736 self.trees = xgb_loader.get_trees(data=data, data_missing=data_missing)737 self.base_offset = xgb_loader.base_score738 self.objective = objective_name_map.get(xgb_loader.name_obj, None)739 self.tree_output = tree_output_name_map.get(xgb_loader.name_obj, None)740 if xgb_loader.num_class > 0:741 self.num_stacked_models = xgb_loader.num_class742 elif safe_isinstance(model, "xgboost.sklearn.XGBClassifier"):743 import xgboost744 self.input_dtype = np.float32745 self.model_type = "xgboost"746 self.original_model = model.get_booster()747 xgb_loader = XGBTreeModelLoader(self.original_model)748 self.trees = xgb_loader.get_trees(data=data, data_missing=data_missing)749 self.base_offset = xgb_loader.base_score750 self.objective = objective_name_map.get(xgb_loader.name_obj, None)751 self.tree_output = tree_output_name_map.get(xgb_loader.name_obj, None)752 self.tree_limit = getattr(model, "best_ntree_limit", None)753 if xgb_loader.num_class > 0:754 self.num_stacked_models = xgb_loader.num_class755 if self.model_output == "predict_proba":756 if self.num_stacked_models == 1:757 self.model_output = "probability_doubled" # with predict_proba we need to double the outputs to match758 else:759 self.model_output = "probability"760 elif safe_isinstance(model, "xgboost.sklearn.XGBRegressor"):761 import xgboost762 self.original_model = model.get_booster()763 self.model_type = "xgboost"764 xgb_loader = XGBTreeModelLoader(self.original_model)765 self.trees = xgb_loader.get_trees(data=data, data_missing=data_missing)766 self.base_offset = xgb_loader.base_score767 self.objective = objective_name_map.get(xgb_loader.name_obj, None)768 self.tree_output = tree_output_name_map.get(xgb_loader.name_obj, None)769 self.tree_limit = getattr(model, "best_ntree_limit", None)770 if xgb_loader.num_class > 0:771 self.num_stacked_models = xgb_loader.num_class772 elif safe_isinstance(model, "xgboost.sklearn.XGBRanker"):773 import xgboost774 self.original_model = model.get_booster()775 self.model_type = "xgboost"776 xgb_loader = XGBTreeModelLoader(self.original_model)777 self.trees = xgb_loader.get_trees(data=data, data_missing=data_missing)778 self.base_offset = xgb_loader.base_score779 # Note: for ranker, leaving tree_output and objective as None as they780 # are not implemented in native code yet781 self.tree_limit = getattr(model, "best_ntree_limit", None)782 if xgb_loader.num_class > 0:783 self.num_stacked_models = xgb_loader.num_class784 elif safe_isinstance(model, "lightgbm.basic.Booster"):785 assert_import("lightgbm")786 self.model_type = "lightgbm"787 self.original_model = model788 tree_info = self.original_model.dump_model()["tree_info"]789 try:790 self.trees = [SingleTree(e, data=data, data_missing=data_missing) for e in tree_info]791 except:792 self.trees = None # we get here because the cext can't handle categorical splits yet793 self.objective = objective_name_map.get(model.params.get("objective", "regression"), None)794 self.tree_output = tree_output_name_map.get(model.params.get("objective", "regression"), None)795 elif safe_isinstance(model, "gpboost.basic.Booster"):796 assert_import("gpboost")797 self.model_type = "gpboost"798 self.original_model = model799 tree_info = self.original_model.dump_model()["tree_info"]800 try:801 self.trees = [SingleTree(e, data=data, data_missing=data_missing) for e in tree_info]802 except:803 self.trees = None # we get here because the cext can't handle categorical splits yet804 self.objective = objective_name_map.get(model.params.get("objective", "regression"), None)805 self.tree_output = tree_output_name_map.get(model.params.get("objective", "regression"), None)806 elif safe_isinstance(model, "lightgbm.sklearn.LGBMRegressor"):807 assert_import("lightgbm")808 self.model_type = "lightgbm"809 self.original_model = model.booster_810 tree_info = self.original_model.dump_model()["tree_info"]811 try:812 self.trees = [SingleTree(e, data=data, data_missing=data_missing) for e in tree_info]813 except:814 self.trees = None # we get here because the cext can't handle categorical splits yet815 self.objective = objective_name_map.get(model.objective, None)816 self.tree_output = tree_output_name_map.get(model.objective, None)817 if model.objective is None:818 self.objective = "squared_error"819 self.tree_output = "raw_value"820 elif safe_isinstance(model, "lightgbm.sklearn.LGBMRanker"):821 assert_import("lightgbm")822 self.model_type = "lightgbm"823 self.original_model = model.booster_824 tree_info = self.original_model.dump_model()["tree_info"]825 try:826 self.trees = [SingleTree(e, data=data, data_missing=data_missing) for e in tree_info]827 except:828 self.trees = None # we get here because the cext can't handle categorical splits yet829 # Note: for ranker, leaving tree_output and objective as None as they830 # are not implemented in native code yet831 elif safe_isinstance(model, "lightgbm.sklearn.LGBMClassifier"):832 assert_import("lightgbm")833 self.model_type = "lightgbm"834 if model.n_classes_ > 2:835 self.num_stacked_models = model.n_classes_836 self.original_model = model.booster_837 tree_info = self.original_model.dump_model()["tree_info"]838 try:839 self.trees = [SingleTree(e, data=data, data_missing=data_missing) for e in tree_info]840 except:841 self.trees = None # we get here because the cext can't handle categorical splits yet842 self.objective = objective_name_map.get(model.objective, None)843 self.tree_output = tree_output_name_map.get(model.objective, None)844 if model.objective is None:845 self.objective = "binary_crossentropy"846 self.tree_output = "log_odds"847 elif safe_isinstance(model, "catboost.core.CatBoostRegressor"):848 assert_import("catboost")849 self.model_type = "catboost"850 self.original_model = model851 self.cat_feature_indices = model.get_cat_feature_indices()852 elif safe_isinstance(model, "catboost.core.CatBoostClassifier"):853 assert_import("catboost")854 self.model_type = "catboost"855 self.original_model = model856 self.input_dtype = np.float32857 try:858 cb_loader = CatBoostTreeModelLoader(model)859 self.trees = cb_loader.get_trees(data=data, data_missing=data_missing)860 except:861 self.trees = None # we get here because the cext can't handle categorical splits yet862 self.tree_output = "log_odds"863 self.objective = "binary_crossentropy"864 self.cat_feature_indices = model.get_cat_feature_indices()865 elif safe_isinstance(model, "catboost.core.CatBoost"):866 assert_import("catboost")867 self.model_type = "catboost"868 self.original_model = model869 self.cat_feature_indices = model.get_cat_feature_indices()870 elif safe_isinstance(model, "imblearn.ensemble._forest.BalancedRandomForestClassifier"):871 self.input_dtype = np.float32872 scaling = 1.0 / len(model.estimators_) # output is average of trees873 self.trees = [SingleTree(e.tree_, normalize=True, scaling=scaling, data=data, data_missing=data_missing) for e in model.estimators_]874 self.objective = objective_name_map.get(model.criterion, None)875 self.tree_output = "probability"876 elif safe_isinstance(model, "ngboost.ngboost.NGBoost") or safe_isinstance(model, "ngboost.api.NGBRegressor") or safe_isinstance(model, "ngboost.api.NGBClassifier"):877 assert model.base_models, "The NGBoost model has empty `base_models`! Have you called `model.fit`?"878 if self.model_output == "raw":879 param_idx = 0 # default to the first parameter of the output distribution880 warnings.warn("Translating model_ouput=\"raw\" to model_output=0 for the 0-th parameter in the distribution. Use model_output=0 directly to avoid this warning.")881 elif type(self.model_output) is int:882 param_idx = self.model_output883 self.model_output = "raw" # note that after loading we have a new model_output type884 assert safe_isinstance(model.base_models[0][param_idx], ["sklearn.tree.DecisionTreeRegressor", "sklearn.tree.tree.DecisionTreeRegressor"]), "You must use default_tree_learner!"885 shap_trees = [trees[param_idx] for trees in model.base_models]886 self.internal_dtype = shap_trees[0].tree_.value.dtype.type887 self.input_dtype = np.float32888 scaling = - model.learning_rate * np.array(model.scalings) # output is weighted average of trees889 self.trees = [SingleTree(e.tree_, scaling=s, data=data, data_missing=data_missing) for e,s in zip(shap_trees,scaling)]890 self.objective = objective_name_map.get(shap_trees[0].criterion, None)891 self.tree_output = "raw_value"892 self.base_offset = model.init_params[param_idx]893 else:894 raise Exception("Model type not yet supported by TreeExplainer: " + str(type(model)))895 # build a dense numpy version of all the tree objects896 if self.trees is not None and self.trees:897 max_nodes = np.max([len(t.values) for t in self.trees])898 assert len(np.unique([t.values.shape[1] for t in self.trees])) == 1, "All trees in the ensemble must have the same output dimension!"899 num_trees = len(self.trees)900 if self.num_stacked_models > 1:901 assert len(self.trees) % self.num_stacked_models == 0, "Only stacked models with equal numbers of trees are supported!"902 assert self.trees[0].values.shape[1] == 1, "Only stacked models with single outputs per model are supported!"903 self.num_outputs = self.num_stacked_models904 else:905 self.num_outputs = self.trees[0].values.shape[1]906 # important to be -1 in unused sections!! This way we can tell which entries are valid.907 self.children_left = -np.ones((num_trees, max_nodes), dtype=np.int32)908 self.children_right = -np.ones((num_trees, max_nodes), dtype=np.int32)909 self.children_default = -np.ones((num_trees, max_nodes), dtype=np.int32)910 self.features = -np.ones((num_trees, max_nodes), dtype=np.int32)911 self.thresholds = np.zeros((num_trees, max_nodes), dtype=self.internal_dtype)912 self.values = np.zeros((num_trees, max_nodes, self.num_outputs), dtype=self.internal_dtype)913 self.node_sample_weight = np.zeros((num_trees, max_nodes), dtype=self.internal_dtype)914 for i in range(num_trees):915 self.children_left[i,:len(self.trees[i].children_left)] = self.trees[i].children_left916 self.children_right[i,:len(self.trees[i].children_right)] = self.trees[i].children_right917 self.children_default[i,:len(self.trees[i].children_default)] = self.trees[i].children_default918 self.features[i,:len(self.trees[i].features)] = self.trees[i].features919 self.thresholds[i,:len(self.trees[i].thresholds)] = self.trees[i].thresholds920 if self.num_stacked_models > 1:921 # stack_pos = int(i // (num_trees / self.num_stacked_models))922 stack_pos = i % self.num_stacked_models923 self.values[i,:len(self.trees[i].values[:,0]),stack_pos] = self.trees[i].values[:,0]924 else:925 self.values[i,:len(self.trees[i].values)] = self.trees[i].values926 self.node_sample_weight[i,:len(self.trees[i].node_sample_weight)] = self.trees[i].node_sample_weight927 # ensure that the passed background dataset lands in every leaf928 if np.min(self.trees[i].node_sample_weight) <= 0:929 self.fully_defined_weighting = False930 self.num_nodes = np.array([len(t.values) for t in self.trees], dtype=np.int32)931 self.max_depth = np.max([t.max_depth for t in self.trees])932 # make sure the base offset is a 1D array933 if not hasattr(self.base_offset, "__len__") or len(self.base_offset) == 0:934 self.base_offset = (np.ones(self.num_outputs) * self.base_offset).astype(self.internal_dtype)935 self.base_offset = self.base_offset.flatten()936 assert len(self.base_offset) == self.num_outputs937 def get_transform(self):938 """ A consistent interface to make predictions from this model.939 """940 if self.model_output == "raw":941 transform = "identity"942 elif self.model_output == "probability" or self.model_output == "probability_doubled":943 if self.tree_output == "log_odds":944 transform = "logistic"945 elif self.tree_output == "probability":946 transform = "identity"947 else:948 raise Exception("model_output = \"probability\" is not yet supported when model.tree_output = \"" + self.tree_output + "\"!")949 elif self.model_output == "log_loss":950 if self.objective == "squared_error":951 transform = "squared_loss"952 elif self.objective == "binary_crossentropy":953 transform = "logistic_nlogloss"954 else:955 raise Exception("model_output = \"log_loss\" is not yet supported when model.objective = \"" + self.objective + "\"!")956 else:957 raise Exception("Unrecognized model_output parameter value: %s! If model.%s is a valid function open a github issue to ask that this method be supported. If you want 'predict_proba' just use 'probability' for now." % (str(self.model_output), str(self.model_output)))958 return transform959 def predict(self, X, y=None, output=None, tree_limit=None):960 """ A consistent interface to make predictions from this model.961 Parameters962 ----------963 tree_limit : None (default) or int964 Limit the number of trees used by the model. By default None means no use the limit of the965 original model, and -1 means no limit.966 """967 if output is None:968 output = self.model_output969 if self.model_type == "pyspark":970 #import pyspark971 # TODO: support predict for pyspark972 raise NotImplementedError("Predict with pyspark isn't implemented. Don't run 'interventional' as feature_perturbation.")973 # see if we have a default tree_limit in place.974 if tree_limit is None:975 tree_limit = -1 if self.tree_limit is None else self.tree_limit976 # convert dataframes977 if safe_isinstance(X, "pandas.core.series.Series"):978 X = X.values979 elif safe_isinstance(X, "pandas.core.frame.DataFrame"):980 X = X.values981 flat_output = False982 if len(X.shape) == 1:983 flat_output = True984 X = X.reshape(1, X.shape[0])985 if X.dtype.type != self.input_dtype:986 X = X.astype(self.input_dtype)987 X_missing = np.isnan(X, dtype=np.bool)988 assert isinstance(X, np.ndarray), "Unknown instance type: " + str(type(X))989 assert len(X.shape) == 2, "Passed input data matrix X must have 1 or 2 dimensions!"990 if tree_limit < 0 or tree_limit > self.values.shape[0]:991 tree_limit = self.values.shape[0]992 if output == "logloss":993 assert y is not None, "Both samples and labels must be provided when explaining the loss (i.e. `explainer.shap_values(X, y)`)!"994 assert X.shape[0] == len(y), "The number of labels (%d) does not match the number of samples to explain (%d)!" % (len(y), X.shape[0])995 transform = self.get_transform()996 assert_import("cext")997 output = np.zeros((X.shape[0], self.num_outputs))998 _cext.dense_tree_predict(999 self.children_left, self.children_right, self.children_default,1000 self.features, self.thresholds, self.values,1001 self.max_depth, tree_limit, self.base_offset, output_transform_codes[transform],1002 X, X_missing, y, output1003 )1004 # drop dimensions we don't need1005 if flat_output:1006 if self.num_outputs == 1:1007 return output.flatten()[0]1008 else:1009 return output.reshape(-1, self.num_outputs)1010 else:1011 if self.num_outputs == 1:1012 return output.flatten()1013 else:1014 return output1015class SingleTree:1016 """ A single decision tree.1017 The primary point of this object is to parse many different tree types into a common format.1018 """1019 def __init__(self, tree, normalize=False, scaling=1.0, data=None, data_missing=None):1020 assert_import("cext")1021 if safe_isinstance(tree, ["sklearn.tree._tree.Tree", "econml.tree._tree.Tree"]):1022 self.children_left = tree.children_left.astype(np.int32)1023 self.children_right = tree.children_right.astype(np.int32)1024 self.children_default = self.children_left # missing values not supported in sklearn1025 self.features = tree.feature.astype(np.int32)1026 self.thresholds = tree.threshold.astype(np.float64)1027 self.values = tree.value.reshape(tree.value.shape[0], tree.value.shape[1] * tree.value.shape[2])1028 if normalize:1029 self.values = (self.values.T / self.values.sum(1)).T1030 self.values = self.values * scaling1031 self.node_sample_weight = tree.weighted_n_node_samples.astype(np.float64)1032 elif type(tree) is dict and 'features' in tree:1033 self.children_left = tree["children_left"].astype(np.int32)1034 self.children_right = tree["children_right"].astype(np.int32)1035 self.children_default = tree["children_default"].astype(np.int32)1036 self.features = tree["features"].astype(np.int32)1037 self.thresholds = tree["thresholds"]1038 self.values = tree["values"] * scaling1039 self.node_sample_weight = tree["node_sample_weight"]1040 # deprecated dictionary support (with sklearn singlular style "feature" and "value" names)1041 elif type(tree) is dict and 'children_left' in tree:1042 self.children_left = tree["children_left"].astype(np.int32)1043 self.children_right = tree["children_right"].astype(np.int32)1044 self.children_default = tree["children_default"].astype(np.int32)1045 self.features = tree["feature"].astype(np.int32)1046 self.thresholds = tree["threshold"]1047 self.values = tree["value"] * scaling1048 self.node_sample_weight = tree["node_sample_weight"]1049 elif safe_isinstance(tree, "pyspark.ml.classification.DecisionTreeClassificationModel") \1050 or safe_isinstance(tree, "pyspark.ml.regression.DecisionTreeRegressionModel"):1051 #model._java_obj.numNodes() doesn't give leaves, need to recompute the size1052 def getNumNodes(node, size):1053 size = size + 11054 if node.subtreeDepth() == 0:1055 return size1056 else:1057 size = getNumNodes(node.leftChild(), size)1058 return getNumNodes(node.rightChild(), size)1059 num_nodes = getNumNodes(tree._java_obj.rootNode(), 0)1060 self.children_left = np.full(num_nodes, -2, dtype=np.int32)1061 self.children_right = np.full(num_nodes, -2, dtype=np.int32)1062 self.children_default = np.full(num_nodes, -2, dtype=np.int32)1063 self.features = np.full(num_nodes, -2, dtype=np.int32)1064 self.thresholds = np.full(num_nodes, -2, dtype=np.float64)1065 self.values = [-2]*num_nodes1066 self.node_sample_weight = np.full(num_nodes, -2, dtype=np.float64)1067 def buildTree(index, node):1068 index = index + 11069 if tree._java_obj.getImpurity() == 'variance':1070 self.values[index] = [node.prediction()] #prediction for the node1071 else:1072 self.values[index] = [e for e in node.impurityStats().stats()] #for gini: NDarray(numLabel): 1 per label: number of item for each label which went through this node1073 self.node_sample_weight[index] = node.impurityStats().count() #weighted count of element trough this node1074 if node.subtreeDepth() == 0:1075 return index1076 else:1077 self.features[index] = node.split().featureIndex() #index of the feature we split on, not available for leaf, int1078 if str(node.split().getClass()).endswith('tree.CategoricalSplit'):1079 #Categorical split isn't implemented, TODO: could fake it by creating a fake node to split on the exact value?1080 raise NotImplementedError('CategoricalSplit are not yet implemented')1081 self.thresholds[index] = node.split().threshold() #threshold for the feature, not available for leaf, float1082 self.children_left[index] = index + 11083 idx = buildTree(index, node.leftChild())1084 self.children_right[index] = idx + 11085 idx = buildTree(idx, node.rightChild())1086 return idx1087 buildTree(-1, tree._java_obj.rootNode())1088 #default Not supported with mlib? (TODO)1089 self.children_default = self.children_left1090 self.values = np.asarray(self.values)1091 if normalize:1092 self.values = (self.values.T / self.values.sum(1)).T1093 self.values = self.values * scaling1094 elif type(tree) == dict and 'tree_structure' in tree: # LightGBM model dump1095 start = tree['tree_structure']1096 num_parents = tree['num_leaves']-11097 self.children_left = np.empty((2*num_parents+1), dtype=np.int32)1098 self.children_right = np.empty((2*num_parents+1), dtype=np.int32)1099 self.children_default = np.empty((2*num_parents+1), dtype=np.int32)1100 self.features = np.empty((2*num_parents+1), dtype=np.int32)1101 self.thresholds = np.empty((2*num_parents+1), dtype=np.float64)1102 self.values = [-2]*(2*num_parents+1)1103 self.node_sample_weight = np.empty((2*num_parents+1), dtype=np.float64)1104 visited, queue = [], [start]1105 while queue:1106 vertex = queue.pop(0)1107 if 'split_index' in vertex.keys():1108 if vertex['split_index'] not in visited:1109 if 'split_index' in vertex['left_child'].keys():1110 self.children_left[vertex['split_index']] = vertex['left_child']['split_index']1111 else:1112 self.children_left[vertex['split_index']] = vertex['left_child']['leaf_index']+num_parents1113 if 'split_index' in vertex['right_child'].keys():1114 self.children_right[vertex['split_index']] = vertex['right_child']['split_index']1115 else:1116 self.children_right[vertex['split_index']] = vertex['right_child']['leaf_index']+num_parents1117 if vertex['default_left']:1118 self.children_default[vertex['split_index']] = self.children_left[vertex['split_index']]1119 else:1120 self.children_default[vertex['split_index']] = self.children_right[vertex['split_index']]1121 self.features[vertex['split_index']] = vertex['split_feature']1122 self.thresholds[vertex['split_index']] = vertex['threshold']1123 self.values[vertex['split_index']] = [vertex['internal_value']]1124 self.node_sample_weight[vertex['split_index']] = vertex['internal_count']1125 visited.append(vertex['split_index'])1126 queue.append(vertex['left_child'])1127 queue.append(vertex['right_child'])1128 else:1129 self.children_left[vertex['leaf_index']+num_parents] = -11130 self.children_right[vertex['leaf_index']+num_parents] = -11131 self.children_default[vertex['leaf_index']+num_parents] = -11132 self.features[vertex['leaf_index']+num_parents] = -11133 self.children_left[vertex['leaf_index']+num_parents] = -11134 self.children_right[vertex['leaf_index']+num_parents] = -11135 self.children_default[vertex['leaf_index']+num_parents] = -11136 self.features[vertex['leaf_index']+num_parents] = -11137 self.thresholds[vertex['leaf_index']+num_parents] = -11138 self.values[vertex['leaf_index']+num_parents] = [vertex['leaf_value']]1139 self.node_sample_weight[vertex['leaf_index']+num_parents] = vertex['leaf_count']1140 self.values = np.asarray(self.values)1141 self.values = np.multiply(self.values, scaling)1142 elif type(tree) == dict and 'nodeid' in tree:1143 """ Directly create tree given the JSON dump (with stats) of a XGBoost model.1144 """1145 def max_id(node):1146 if "children" in node:1147 return max(node["nodeid"], *[max_id(n) for n in node["children"]])1148 else:1149 return node["nodeid"]1150 m = max_id(tree) + 11151 self.children_left = -np.ones(m, dtype=np.int32)1152 self.children_right = -np.ones(m, dtype=np.int32)1153 self.children_default = -np.ones(m, dtype=np.int32)1154 self.features = -np.ones(m, dtype=np.int32)1155 self.thresholds = np.zeros(m, dtype=np.float64)1156 self.values = np.zeros((m, 1), dtype=np.float64)1157 self.node_sample_weight = np.empty(m, dtype=np.float64)1158 def extract_data(node, tree):1159 i = node["nodeid"]1160 tree.node_sample_weight[i] = node["cover"]1161 if "children" in node:1162 tree.children_left[i] = node["yes"]1163 tree.children_right[i] = node["no"]1164 tree.children_default[i] = node["missing"]1165 tree.features[i] = node["split"]1166 tree.thresholds[i] = node["split_condition"]1167 for n in node["children"]:1168 extract_data(n, tree)1169 elif "leaf" in node:1170 tree.values[i] = node["leaf"] * scaling1171 extract_data(tree, self)1172 elif type(tree) == str:1173 """ Build a tree from a text dump (with stats) of xgboost.1174 """1175 nodes = [t.lstrip() for t in tree[:-1].split("\n")]1176 nodes_dict = {}1177 for n in nodes: nodes_dict[int(n.split(":")[0])] = n.split(":")[1]1178 m = max(nodes_dict.keys())+11179 children_left = -1*np.ones(m,dtype="int32")1180 children_right = -1*np.ones(m,dtype="int32")1181 children_default = -1*np.ones(m,dtype="int32")1182 features = -2*np.ones(m,dtype="int32")1183 thresholds = -1*np.ones(m,dtype="float64")1184 values = 1*np.ones(m,dtype="float64")1185 node_sample_weight = np.zeros(m,dtype="float64")1186 values_lst = list(nodes_dict.values())1187 keys_lst = list(nodes_dict.keys())1188 for i in range(0,len(keys_lst)):1189 value = values_lst[i]1190 key = keys_lst[i]1191 if ("leaf" in value):1192 # Extract values1193 val = float(value.split("leaf=")[1].split(",")[0])1194 node_sample_weight_val = float(value.split("cover=")[1])1195 # Append to lists1196 values[key] = val1197 node_sample_weight[key] = node_sample_weight_val1198 else:1199 c_left = int(value.split("yes=")[1].split(",")[0])1200 c_right = int(value.split("no=")[1].split(",")[0])1201 c_default = int(value.split("missing=")[1].split(",")[0])1202 feat_thres = value.split(" ")[0]1203 if ("<" in feat_thres):1204 feature = int(feat_thres.split("<")[0][2:])1205 threshold = float(feat_thres.split("<")[1][:-1])1206 if ("=" in feat_thres):1207 feature = int(feat_thres.split("=")[0][2:])1208 threshold = float(feat_thres.split("=")[1][:-1])1209 node_sample_weight_val = float(value.split("cover=")[1].split(",")[0])1210 children_left[key] = c_left1211 children_right[key] = c_right1212 children_default[key] = c_default1213 features[key] = feature1214 thresholds[key] = threshold1215 node_sample_weight[key] = node_sample_weight_val1216 self.children_left = children_left1217 self.children_right = children_right1218 self.children_default = children_default1219 self.features = features1220 self.thresholds = thresholds1221 self.values = values[:,np.newaxis] * scaling1222 self.node_sample_weight = node_sample_weight1223 else:1224 raise Exception("Unknown input to SingleTree constructor: " + str(tree))1225 # Re-compute the number of samples that pass through each node if we are given data1226 if data is not None and data_missing is not None:1227 self.node_sample_weight[:] = 0.01228 _cext.dense_tree_update_weights(1229 self.children_left, self.children_right, self.children_default, self.features,1230 self.thresholds, self.values, 1, self.node_sample_weight, data, data_missing1231 )1232 # we compute the expectations to make sure they follow the SHAP logic1233 self.max_depth = _cext.compute_expectations(1234 self.children_left, self.children_right, self.node_sample_weight,1235 self.values1236 )1237class IsoTree(SingleTree):1238 """1239 In sklearn the tree of the Isolation Forest does not calculated in a good way.1240 """1241 def __init__(self, tree, tree_features, normalize=False, scaling=1.0, data=None, data_missing=None):1242 super(IsoTree, self).__init__(tree, normalize, scaling, data, data_missing)1243 if safe_isinstance(tree, "sklearn.tree._tree.Tree"):1244 from sklearn.ensemble._iforest import _average_path_length # pylint: disable=no-name-in-module1245 def _recalculate_value(tree, i , level):1246 if tree.children_left[i] == -1 and tree.children_right[i] == -1:1247 value = level + _average_path_length(np.array([tree.n_node_samples[i]]))[0]1248 self.values[i, 0] = value1249 return value * tree.n_node_samples[i]1250 else:1251 value_left = _recalculate_value(tree, tree.children_left[i] , level + 1)1252 value_right = _recalculate_value(tree, tree.children_right[i] , level + 1)1253 self.values[i, 0] = (value_left + value_right) / tree.n_node_samples[i]1254 return value_left + value_right1255 _recalculate_value(tree, 0, 0)1256 if normalize:1257 self.values = (self.values.T / self.values.sum(1)).T...
base_tree.py
Source:base_tree.py
...75 self.base_offset = model["base_offset"]76 self.trees = [SingleTree(t, data=data, data_missing=data_missing) for t in model["trees"]]77 elif type(model) is list and type(model[0]) == SingleTree: # old-style direct-load format78 self.trees = model79 elif safe_isinstance(model,80 ["sklearn.ensemble.RandomForestRegressor", "sklearn.ensemble.forest.RandomForestRegressor",81 "econml.grf._base_grf.BaseGRF"]):82 assert hasattr(model, "estimators_"), "Model has no `estimators_`! Have you called `model.fit`?"83 self.internal_dtype = model.estimators_[0].tree_.value.dtype.type84 self.input_dtype = np.float3285 scaling = 1.0 / len(model.estimators_) # output is average of trees86 # self.scaling = scaling87 self.trees = [SingleTree(e.tree_, scaling=scaling, data=data, data_missing=data_missing) for e in88 model.estimators_]89 self.objective = objective_name_map.get(model.criterion, None)90 self.tree_output = "raw_value"91 elif safe_isinstance(model, ["sklearn.ensemble.IsolationForest", "sklearn.ensemble._iforest.IsolationForest"]):92 self.dtype = np.float3293 scaling = 1.0 / len(model.estimators_) # output is average of trees94 # self.scaling = scaling95 self.trees = [IsoTree(e.tree_, f, scaling=scaling, data=data, data_missing=data_missing) for e, f in96 zip(model.estimators_, model.estimators_features_)]97 self.tree_output = "raw_value"98 elif safe_isinstance(model, ["pyod.models.iforest.IForest"]):99 self.dtype = np.float32100 scaling = 1.0 / len(model.estimators_) # output is average of trees101 # self.scaling = scaling102 self.trees = [IsoTree(e.tree_, f, scaling=scaling, data=data, data_missing=data_missing) for e, f in103 zip(model.detector_.estimators_, model.detector_.estimators_features_)]104 self.tree_output = "raw_value"105 elif safe_isinstance(model, "skopt.learning.forest.RandomForestRegressor"):106 assert hasattr(model, "estimators_"), "Model has no `estimators_`! Have you called `model.fit`?"107 self.internal_dtype = model.estimators_[0].tree_.value.dtype.type108 self.input_dtype = np.float32109 scaling = 1.0 / len(model.estimators_) # output is average of trees110 # self.scaling = scaling111 self.trees = [SingleTree(e.tree_, scaling=scaling, data=data, data_missing=data_missing) for e in112 model.estimators_]113 self.objective = objective_name_map.get(model.criterion, None)114 self.tree_output = "raw_value"115 elif safe_isinstance(model,116 ["sklearn.ensemble.ExtraTreesRegressor", "sklearn.ensemble.forest.ExtraTreesRegressor"]):117 assert hasattr(model, "estimators_"), "Model has no `estimators_`! Have you called `model.fit`?"118 self.internal_dtype = model.estimators_[0].tree_.value.dtype.type119 self.input_dtype = np.float32120 scaling = 1.0 / len(model.estimators_) # output is average of trees121 # self.scaling = scaling122 self.trees = [SingleTree(e.tree_, scaling=scaling, data=data, data_missing=data_missing) for e in123 model.estimators_]124 self.objective = objective_name_map.get(model.criterion, None)125 self.tree_output = "raw_value"126 elif safe_isinstance(model, "skopt.learning.forest.ExtraTreesRegressor"):127 assert hasattr(model, "estimators_"), "Model has no `estimators_`! Have you called `model.fit`?"128 self.internal_dtype = model.estimators_[0].tree_.value.dtype.type129 self.input_dtype = np.float32130 scaling = 1.0 / len(model.estimators_) # output is average of trees131 # self.scaling = scaling132 self.trees = [SingleTree(e.tree_, scaling=scaling, data=data, data_missing=data_missing) for e in133 model.estimators_]134 self.objective = objective_name_map.get(model.criterion, None)135 self.tree_output = "raw_value"136 elif safe_isinstance(model, ["sklearn.tree.DecisionTreeRegressor", "sklearn.tree.tree.DecisionTreeRegressor",137 "econml.grf._base_grftree.GRFTree"]):138 self.internal_dtype = model.tree_.value.dtype.type139 self.input_dtype = np.float32140 self.trees = [SingleTree(model.tree_, data=data, data_missing=data_missing)]141 self.objective = objective_name_map.get(model.criterion, None)142 self.tree_output = "raw_value"143 # self.scaling = 1144 elif safe_isinstance(model,145 ["sklearn.tree.DecisionTreeClassifier", "sklearn.tree.tree.DecisionTreeClassifier"]):146 self.internal_dtype = model.tree_.value.dtype.type147 self.input_dtype = np.float32148 self.trees = [SingleTree(model.tree_, normalize=True, data=data, data_missing=data_missing)]149 self.objective = objective_name_map.get(model.criterion, None)150 self.tree_output = "probability"151 # self.scaling = 1152 elif safe_isinstance(model, ["sklearn.ensemble.RandomForestClassifier",153 "sklearn.ensemble.forest.RandomForestClassifier"]):154 assert hasattr(model, "estimators_"), "Model has no `estimators_`! Have you called `model.fit`?"155 self.internal_dtype = model.estimators_[0].tree_.value.dtype.type156 self.input_dtype = np.float32157 scaling = 1.0 / len(model.estimators_) # output is average of trees158 # self.scaling = scaling159 self.trees = [SingleTree(e.tree_, normalize=True, scaling=scaling, data=data, data_missing=data_missing) for160 e in model.estimators_]161 self.objective = objective_name_map.get(model.criterion, None)162 self.tree_output = "probability"163 elif safe_isinstance(model, ["sklearn.ensemble.ExtraTreesClassifier",164 "sklearn.ensemble.forest.ExtraTreesClassifier"]): # TODO: add unit test for this case165 assert hasattr(model, "estimators_"), "Model has no `estimators_`! Have you called `model.fit`?"166 self.internal_dtype = model.estimators_[0].tree_.value.dtype.type167 self.input_dtype = np.float32168 scaling = 1.0 / len(model.estimators_) # output is average of trees169 # self.scaling = scaling170 self.trees = [SingleTree(e.tree_, normalize=True, scaling=scaling, data=data, data_missing=data_missing) for171 e in model.estimators_]172 self.objective = objective_name_map.get(model.criterion, None)173 self.tree_output = "probability"174 elif safe_isinstance(model, ["sklearn.ensemble.GradientBoostingRegressor",175 "sklearn.ensemble.gradient_boosting.GradientBoostingRegressor"]):176 self.input_dtype = np.float32177 # currently we only support the mean and quantile estimators178 if safe_isinstance(model.init_,179 ["sklearn.ensemble.MeanEstimator", "sklearn.ensemble.gradient_boosting.MeanEstimator"]):180 self.base_offset = model.init_.mean181 elif safe_isinstance(model.init_, ["sklearn.ensemble.QuantileEstimator",182 "sklearn.ensemble.gradient_boosting.QuantileEstimator"]):183 self.base_offset = model.init_.quantile184 elif safe_isinstance(model.init_, "sklearn.dummy.DummyRegressor"):185 self.base_offset = model.init_.constant_[0]186 else:187 assert False, "Unsupported init model type: " + str(type(model.init_))188 self.trees = [SingleTree(e.tree_, scaling=model.learning_rate, data=data, data_missing=data_missing) for e189 in model.estimators_[:, 0]]190 # self.scaling = model.learning_rate191 self.objective = objective_name_map.get(model.criterion, None)192 self.tree_output = "raw_value"193 elif safe_isinstance(model, ["sklearn.ensemble.HistGradientBoostingRegressor"]):194 import sklearn195 if self.model_output == "predict":196 self.model_output = "raw"197 self.input_dtype = sklearn.ensemble._hist_gradient_boosting.common.X_DTYPE198 self.base_offset = model._baseline_prediction199 self.trees = []200 for p in model._predictors:201 nodes = p[0].nodes202 # each node has values: ('value', 'count', 'feature_idx', 'threshold', 'missing_go_to_left', 'left', 'right', 'gain', 'depth', 'is_leaf', 'bin_threshold')203 tree = {204 "children_left": np.array([-1 if n[9] else n[5] for n in nodes]),205 "children_right": np.array([-1 if n[9] else n[6] for n in nodes]),206 "children_default": np.array([-1 if n[9] else (n[5] if n[4] else n[6]) for n in nodes]),207 "features": np.array([-2 if n[9] else n[2] for n in nodes]),208 "thresholds": np.array([n[3] for n in nodes], dtype=np.float64),209 "values": np.array([[n[0]] for n in nodes], dtype=np.float64),210 "node_sample_weight": np.array([n[1] for n in nodes], dtype=np.float64),211 }212 self.trees.append(SingleTree(tree, data=data, data_missing=data_missing))213 self.objective = objective_name_map.get(model.loss, None)214 self.tree_output = "raw_value"215 # self.scaling = 1216 elif safe_isinstance(model, ["sklearn.ensemble.HistGradientBoostingClassifier"]):217 import sklearn218 self.base_offset = model._baseline_prediction219 if hasattr(self.base_offset, "__len__") and self.model_output != "raw":220 raise Exception(221 "Multi-output HistGradientBoostingClassifier models are not yet supported unless model_output=\"raw\". See GitHub issue #1028")222 self.input_dtype = sklearn.ensemble._hist_gradient_boosting.common.X_DTYPE223 self.num_stacked_models = len(model._predictors[0])224 if self.model_output == "predict_proba":225 if self.num_stacked_models == 1:226 self.model_output = "probability_doubled" # with predict_proba we need to double the outputs to match227 else:228 self.model_output = "probability"229 self.trees = []230 for p in model._predictors:231 for i in range(self.num_stacked_models):232 nodes = p[i].nodes233 # each node has values: ('value', 'count', 'feature_idx', 'threshold', 'missing_go_to_left', 'left', 'right', 'gain', 'depth', 'is_leaf', 'bin_threshold')234 tree = {235 "children_left": np.array([-1 if n[9] else n[5] for n in nodes]),236 "children_right": np.array([-1 if n[9] else n[6] for n in nodes]),237 "children_default": np.array([-1 if n[9] else (n[5] if n[4] else n[6]) for n in nodes]),238 "features": np.array([-2 if n[9] else n[2] for n in nodes]),239 "thresholds": np.array([n[3] for n in nodes], dtype=np.float64),240 "values": np.array([[n[0]] for n in nodes], dtype=np.float64),241 "node_sample_weight": np.array([n[1] for n in nodes], dtype=np.float64),242 }243 self.trees.append(SingleTree(tree, data=data, data_missing=data_missing))244 self.objective = objective_name_map.get(model.loss, None)245 self.tree_output = "log_odds"246 # self.scaling = 1247 elif safe_isinstance(model, ["sklearn.ensemble.GradientBoostingClassifier",248 "sklearn.ensemble._gb.GradientBoostingClassifier",249 "sklearn.ensemble.gradient_boosting.GradientBoostingClassifier"]):250 self.input_dtype = np.float32251 # TODO: deal with estimators for each class252 if model.estimators_.shape[1] > 1:253 assert False, "GradientBoostingClassifier is only supported for binary classification right now!"254 # currently we only support the logs odds estimator255 if safe_isinstance(model.init_, ["sklearn.ensemble.LogOddsEstimator",256 "sklearn.ensemble.gradient_boosting.LogOddsEstimator"]):257 self.base_offset = model.init_.prior258 self.tree_output = "log_odds"259 elif safe_isinstance(model.init_, "sklearn.dummy.DummyClassifier"):260 self.base_offset = scipy.special.logit(model.init_.class_prior_[261 1]) # with two classes the trees only model the second class. # pylint: disable=no-member262 self.tree_output = "log_odds"263 else:264 assert False, "Unsupported init model type: " + str(type(model.init_))265 self.trees = [SingleTree(e.tree_, scaling=model.learning_rate, data=data, data_missing=data_missing) for e266 in model.estimators_[:, 0]]267 # self.scaling = model.learning_rate268 self.objective = objective_name_map.get(model.criterion, None)269 elif "pyspark.ml" in str(type(model)):270 assert_import("pyspark")271 self.model_type = "pyspark"272 # model._java_obj.getImpurity() can be gini, entropy or variance.273 self.objective = objective_name_map.get(model._java_obj.getImpurity(), None)274 if "Classification" in str(type(model)):275 normalize = True276 self.tree_output = "probability"277 else:278 normalize = False279 self.tree_output = "raw_value"280 # Spark Random forest, create 1 weighted (avg) tree per sub-model281 if safe_isinstance(model, "pyspark.ml.classification.RandomForestClassificationModel") \282 or safe_isinstance(model, "pyspark.ml.regression.RandomForestRegressionModel"):283 sum_weight = sum(model.treeWeights) # output is average of trees284 self.trees = [SingleTree(tree, normalize=normalize, scaling=model.treeWeights[i] / sum_weight) for285 i, tree in enumerate(model.trees)]286 # self.scaling = model.treeWeights[i] / sum_weight287 # Spark GBT, create 1 weighted (learning rate) tree per sub-model288 elif safe_isinstance(model, "pyspark.ml.classification.GBTClassificationModel") \289 or safe_isinstance(model, "pyspark.ml.regression.GBTRegressionModel"):290 self.objective = "squared_error" # GBT subtree use the variance291 self.tree_output = "raw_value"292 self.trees = [SingleTree(tree, normalize=False, scaling=model.treeWeights[i]) for i, tree in293 enumerate(model.trees)]294 # Spark Basic model (single tree)295 elif safe_isinstance(model, "pyspark.ml.classification.DecisionTreeClassificationModel") \296 or safe_isinstance(model, "pyspark.ml.regression.DecisionTreeRegressionModel"):297 self.trees = [SingleTree(model, normalize=normalize, scaling=1)]298 else:299 assert False, "Unsupported Spark model type: " + str(type(model))300 elif safe_isinstance(model, "xgboost.core.Booster"):301 import xgboost302 self.original_model = model303 self.model_type = "xgboost"304 xgb_loader = XGBTreeModelLoader(self.original_model)305 self.trees = xgb_loader.get_trees(data=data, data_missing=data_missing)306 self.base_offset = xgb_loader.base_score307 self.objective = objective_name_map.get(xgb_loader.name_obj, None)308 self.tree_output = tree_output_name_map.get(xgb_loader.name_obj, None)309 if xgb_loader.num_class > 0:310 self.num_stacked_models = xgb_loader.num_class311 # self.scaling = 1312 elif safe_isinstance(model, "xgboost.sklearn.XGBClassifier"):313 import xgboost314 self.input_dtype = np.float32315 self.model_type = "xgboost"316 self.original_model = model.get_booster()317 xgb_loader = XGBTreeModelLoader(self.original_model)318 self.trees = xgb_loader.get_trees(data=data, data_missing=data_missing)319 self.base_offset = xgb_loader.base_score320 self.objective = objective_name_map.get(xgb_loader.name_obj, None)321 self.tree_output = tree_output_name_map.get(xgb_loader.name_obj, None)322 self.tree_limit = getattr(model, "best_ntree_limit", None)323 if xgb_loader.num_class > 0:324 self.num_stacked_models = xgb_loader.num_class325 if self.model_output == "predict_proba":326 if self.num_stacked_models == 1:327 self.model_output = "probability_doubled" # with predict_proba we need to double the outputs to match328 else:329 self.model_output = "probability"330 # self.scaling = 1331 elif safe_isinstance(model, "xgboost.sklearn.XGBRegressor"):332 import xgboost333 self.original_model = model.get_booster()334 self.model_type = "xgboost"335 xgb_loader = XGBTreeModelLoader(self.original_model)336 self.trees = xgb_loader.get_trees(data=data, data_missing=data_missing)337 self.base_offset = xgb_loader.base_score338 self.objective = objective_name_map.get(xgb_loader.name_obj, None)339 self.tree_output = tree_output_name_map.get(xgb_loader.name_obj, None)340 self.tree_limit = getattr(model, "best_ntree_limit", None)341 if xgb_loader.num_class > 0:342 self.num_stacked_models = xgb_loader.num_class343 # self.scaling = 1344 elif safe_isinstance(model, "xgboost.sklearn.XGBRanker"):345 import xgboost346 self.original_model = model.get_booster()347 self.model_type = "xgboost"348 xgb_loader = XGBTreeModelLoader(self.original_model)349 self.trees = xgb_loader.get_trees(data=data, data_missing=data_missing)350 self.base_offset = xgb_loader.base_score351 # Note: for ranker, leaving tree_output and objective as None as they352 # are not implemented in native code yet353 self.tree_limit = getattr(model, "best_ntree_limit", None)354 if xgb_loader.num_class > 0:355 self.num_stacked_models = xgb_loader.num_class356 # self.scaling = 1357 elif safe_isinstance(model, "lightgbm.basic.Booster"):358 assert_import("lightgbm")359 self.model_type = "lightgbm"360 self.original_model = model361 tree_info = self.original_model.dump_model()["tree_info"]362 try:363 self.trees = [SingleTree(e, data=data, data_missing=data_missing) for e in tree_info]364 except:365 self.trees = None # we get here because the cext can't handle categorical splits yet366 self.objective = objective_name_map.get(model.params.get("objective", "regression"), None)367 self.tree_output = tree_output_name_map.get(model.params.get("objective", "regression"), None)368 # self.scaling = 1369 elif safe_isinstance(model, "gpboost.basic.Booster"):370 assert_import("gpboost")371 self.model_type = "gpboost"372 self.original_model = model373 tree_info = self.original_model.dump_model()["tree_info"]374 try:375 self.trees = [SingleTree(e, data=data, data_missing=data_missing) for e in tree_info]376 except:377 self.trees = None # we get here because the cext can't handle categorical splits yet378 self.objective = objective_name_map.get(model.params.get("objective", "regression"), None)379 self.tree_output = tree_output_name_map.get(model.params.get("objective", "regression"), None)380 # self.scaling = 1381 elif safe_isinstance(model, "lightgbm.sklearn.LGBMRegressor"):382 assert_import("lightgbm")383 self.model_type = "lightgbm"384 self.original_model = model.booster_385 tree_info = self.original_model.dump_model()["tree_info"]386 try:387 self.trees = [SingleTree(e, data=data, data_missing=data_missing) for e in tree_info]388 except:389 self.trees = None # we get here because the cext can't handle categorical splits yet390 self.objective = objective_name_map.get(model.objective, None)391 self.tree_output = tree_output_name_map.get(model.objective, None)392 if model.objective is None:393 self.objective = "squared_error"394 self.tree_output = "raw_value"395 # self.scaling = 1396 elif safe_isinstance(model, "lightgbm.sklearn.LGBMRanker"):397 assert_import("lightgbm")398 self.model_type = "lightgbm"399 self.original_model = model.booster_400 tree_info = self.original_model.dump_model()["tree_info"]401 try:402 self.trees = [SingleTree(e, data=data, data_missing=data_missing) for e in tree_info]403 except:404 self.trees = None # we get here because the cext can't handle categorical splits yet405 # Note: for ranker, leaving tree_output and objective as None as they406 # are not implemented in native code yet407 # self.scaling = 1408 elif safe_isinstance(model, "lightgbm.sklearn.LGBMClassifier"):409 assert_import("lightgbm")410 self.model_type = "lightgbm"411 if model.n_classes_ > 2:412 self.num_stacked_models = model.n_classes_413 self.original_model = model.booster_414 tree_info = self.original_model.dump_model()["tree_info"]415 try:416 self.trees = [SingleTree(e, data=data, data_missing=data_missing) for e in tree_info]417 except:418 self.trees = None # we get here because the cext can't handle categorical splits yet419 self.objective = objective_name_map.get(model.objective, None)420 self.tree_output = tree_output_name_map.get(model.objective, None)421 if model.objective is None:422 self.objective = "binary_crossentropy"423 self.tree_output = "log_odds"424 # self.scaling = 1425 elif safe_isinstance(model, "catboost.core.CatBoostRegressor"):426 assert_import("catboost")427 self.model_type = "catboost"428 self.original_model = model429 self.cat_feature_indices = model.get_cat_feature_indices()430 # self.scaling = 1431 elif safe_isinstance(model, "catboost.core.CatBoostClassifier"):432 assert_import("catboost")433 self.model_type = "catboost"434 self.original_model = model435 self.input_dtype = np.float32436 try:437 cb_loader = CatBoostTreeModelLoader(model)438 self.trees = cb_loader.get_trees(data=data, data_missing=data_missing)439 except:440 self.trees = None # we get here because the cext can't handle categorical splits yet441 self.tree_output = "log_odds"442 self.objective = "binary_crossentropy"443 self.cat_feature_indices = model.get_cat_feature_indices()444 # self.scaling = 1445 elif safe_isinstance(model, "catboost.core.CatBoost"):446 assert_import("catboost")447 self.model_type = "catboost"448 self.original_model = model449 self.cat_feature_indices = model.get_cat_feature_indices()450 # self.scaling = 1451 elif safe_isinstance(model, "imblearn.ensemble._forest.BalancedRandomForestClassifier"):452 self.input_dtype = np.float32453 scaling = 1.0 / len(model.estimators_) # output is average of trees454 self.trees = [SingleTree(e.tree_, normalize=True, scaling=scaling, data=data, data_missing=data_missing) for455 e in model.estimators_]456 self.objective = objective_name_map.get(model.criterion, None)457 self.tree_output = "probability"458 # self.scaling = 1459 elif safe_isinstance(model, "ngboost.ngboost.NGBoost") or safe_isinstance(model,460 "ngboost.api.NGBRegressor") or safe_isinstance(461 model, "ngboost.api.NGBClassifier"):462 assert model.base_models, "The NGBoost model has empty `base_models`! Have you called `model.fit`?"463 if self.model_output == "raw":464 param_idx = 0 # default to the first parameter of the output distribution465 warnings.warn(466 "Translating model_ouput=\"raw\" to model_output=0 for the 0-th parameter in the distribution. Use model_output=0 directly to avoid this warning.")467 elif type(self.model_output) is int:468 param_idx = self.model_output469 self.model_output = "raw" # note that after loading we have a new model_output type470 assert safe_isinstance(model.base_models[0][param_idx], ["sklearn.tree.DecisionTreeRegressor",471 "sklearn.tree.tree.DecisionTreeRegressor"]), "You must use default_tree_learner!"472 shap_trees = [trees[param_idx] for trees in model.base_models]473 self.internal_dtype = shap_trees[0].tree_.value.dtype.type474 self.input_dtype = np.float32475 scaling = - model.learning_rate * np.array(model.scalings) # output is weighted average of trees476 self.trees = [SingleTree(e.tree_, scaling=s, data=data, data_missing=data_missing) for e, s in477 zip(shap_trees, scaling)]478 self.objective = objective_name_map.get(shap_trees[0].criterion, None)479 self.tree_output = "raw_value"480 self.base_offset = model.init_params[param_idx]481 # self.scaling = 1482 else:483 raise Exception("Model type not yet supported by TreeExplainer: " + str(type(model)))484 # build a dense numpy version of all the tree objects485 if self.trees is not None and self.trees:486 max_nodes = np.max([len(t.values) for t in self.trees])487 assert len(np.unique([t.values.shape[1] for t in488 self.trees])) == 1, "All trees in the ensemble must have the same output dimension!"489 num_trees = len(self.trees)490 if self.num_stacked_models > 1:491 assert len(492 self.trees) % self.num_stacked_models == 0, "Only stacked models with equal numbers of trees are supported!"493 assert self.trees[0].values.shape[494 1] == 1, "Only stacked models with single outputs per model are supported!"495 self.num_outputs = self.num_stacked_models496 else:497 self.num_outputs = self.trees[0].values.shape[1]498 if safe_isinstance(model, ["xgboost.sklearn.XGBClassifier",499 "catboost.core.CatBoostClassifier", "lightgbm.sklearn.LGBMClassifier"]) and \500 self.num_outputs == 1:501 self.values_binary = np.zeros((num_trees, max_nodes, 2), dtype=self.internal_dtype)502 for i in range(num_trees):503 # y = self.model.predict(self.data)504 # self.trees[i].values = np.zeros((max_nodes, self.num_outputs))505 # rebuild_acvtree(0, self.trees[i], self.data, y)506 # self.trees[i].values = self.trees[i].scaling * self.trees[i].values507 # p = np.exp(self.trees[i].values)/(1+np.exp(self.trees[i].values))508 p = 1/(1+np.exp(-self.trees[i].values))509 self.values_binary[i, :len(self.trees[i].values)] = np.concatenate([1-p, p], axis=1)/num_trees510 # important to be -1 in unused sections!! This way we can tell which entries are valid.511 self.children_left = -np.ones((num_trees, max_nodes), dtype=np.int32)512 self.children_right = -np.ones((num_trees, max_nodes), dtype=np.int32)513 self.children_default = -np.ones((num_trees, max_nodes), dtype=np.int32)514 self.features = -np.ones((num_trees, max_nodes), dtype=np.int32)515 self.thresholds = np.zeros((num_trees, max_nodes), dtype=self.internal_dtype)516 self.values = np.zeros((num_trees, max_nodes, self.num_outputs), dtype=self.internal_dtype)517 self.node_sample_weight = np.zeros((num_trees, max_nodes), dtype=self.internal_dtype)518 self.partition_leaves_trees = []519 self.node_idx_trees = []520 self.data_leaves_trees = []521 self.leaf_idx_trees = []522 self.leaves_nb = []523 self.scalings = []524 for i in tqdm(range(num_trees)):525 self.scalings.append(self.trees[i].scaling)526 self.children_left[i, :len(self.trees[i].children_left)] = self.trees[i].children_left527 self.children_right[i, :len(self.trees[i].children_right)] = self.trees[i].children_right528 self.children_default[i, :len(self.trees[i].children_default)] = self.trees[i].children_default529 self.features[i, :len(self.trees[i].features)] = self.trees[i].features530 self.thresholds[i, :len(self.trees[i].thresholds)] = self.trees[i].thresholds531 if self.num_stacked_models > 1:532 # stack_pos = int(i // (num_trees / self.num_stacked_models))533 stack_pos = i % self.num_stacked_models534 self.values[i, :len(self.trees[i].values[:, 0]), stack_pos] = self.trees[i].values[:, 0]535 else:536 self.values[i, :len(self.trees[i].values)] = self.trees[i].values537 self.node_sample_weight[i, :len(self.trees[i].node_sample_weight)] = self.trees[i].node_sample_weight538 # ensure that the passed background dataset lands in every leaf539 if np.min(self.trees[i].node_sample_weight) <= 0:540 self.fully_defined_weighting = False541 self.leaf_idx = [idx for idx in range(len(self.trees[i].features))542 if self.trees[i].children_left[idx] < 0]543 self.leaves_nb.append(len(self.leaf_idx))544 self.partition_leaves = []545 self.node_idx = []546 self.max_var = []547 self.data_leaves = []548 for leaf_id in self.leaf_idx:549 node_id = [-1]550 partition_leaf = [np.array([[-np.inf, np.inf]]) for idx2 in range(self.data.shape[1])]551 _ = get_partition(leaf_id, partition_leaf, node_id, self.trees[i].children_left,552 self.trees[i].children_right, self.trees[i].features, self.trees[i].thresholds)553 self.partition_leaves.append(np.squeeze(np.array(partition_leaf)))554 self.node_idx.append(list(set(node_id[1:])))555 self.max_var.append(len(self.node_idx[-1]))556 # self.data_leaves.append(np.array([(self.data[:, s] <= self.partition_leaves[-1][s, 1]) * \557 # (self.data[:, s] > self.partition_leaves[-1][s, 0])558 # for s in range(self.data.shape[1])], dtype=np.int).transpose())559 self.partition_leaves_trees.append(self.partition_leaves)560 # self.data_leaves_trees.append(self.data_leaves)561 self.node_idx_trees.append(self.node_idx)562 self.leaf_idx_trees.append(self.leaf_idx)563 leaf_idx_trees = -np.ones(shape=(len(self.leaves_nb), np.max(self.leaves_nb)), dtype=np.int)564 partition_leaves_trees = -np.ones(shape=(len(self.leaves_nb), np.max(self.leaves_nb), self.data.shape[1], 2))565 # data_leaves_trees = -np.ones(shape=(len(self.leaves_nb), np.max(self.leaves_nb), self.data.shape[0], self.data.shape[1]), dtype=np.int)566 for i in range(len(self.leaves_nb)):567 leaf_idx_trees[i, :self.leaves_nb[i]] = np.array(self.leaf_idx_trees[i], dtype=np.int)568 partition_leaves_trees[i, :self.leaves_nb[i]] = np.array(self.partition_leaves_trees[i])569 # data_leaves_trees[i, :self.leaves_nb[i]] = np.array(self.data_leaves_trees[i], dtype=np.int)570 self.leaf_idx_trees = leaf_idx_trees571 self.partition_leaves_trees = partition_leaves_trees572 self.leaves_nb = np.array(self.leaves_nb, dtype=np.int)573 self.scalings = np.array(self.scalings, dtype=np.float)574 self.data = np.array(self.data, dtype=np.float)575 self.max_var = np.max(self.max_var)576 # self.data_leaves_trees = data_leaves_trees577 # if safe_isinstance(model, ["xgboost.sklearn.XGBClassifier",578 # "catboost.core.CatBoostClassifier", "lightgbm.sklearn.LGBMClassifier"]) and \579 # self.num_outputs == 1:580 # p = np.exp(self.values)/(1 + np.exp(self.values))581 # print(np.max(p), np.min(1-p))582 # self.values = np.concatenate([1-p, p], axis=2)583 # self.num_outputs = 2584 self.num_nodes = np.array([len(t.values) for t in self.trees], dtype=np.int32)585 self.max_depth = np.max([t.max_depth for t in self.trees])586 if self.cache:587 if self.multi_threads:588 self.lm, self.lm_s, self.lm_si = self.leaves_cache(C=self.C)589 else:590 self.lm, self.lm_s, self.lm_si = self.leaves_cache_nopa(C=self.C)591 if self.cache_normalized:592 if self.multi_threads:593 self.lm_n, self.lm_s_n, self.lm_si_n = self.leaves_cache_normalized(C=self.C)594 else:595 self.lm_n, self.lm_s_n, self.lm_si_n = self.leaves_cache_normalized_nopa(C=self.C)596 # make sure the base offset is a 1D array597 if not hasattr(self.base_offset, "__len__") or len(self.base_offset) == 0:598 self.base_offset = (np.ones(self.num_outputs) * self.base_offset).astype(self.internal_dtype)599 self.base_offset = self.base_offset.flatten()600 assert len(self.base_offset) == self.num_outputs601 @abstractmethod602 def compute_cond_exp(self, X, S, data):603 pass604 @abstractmethod605 def shap_values(self, x, C):606 pass607 @abstractmethod608 def shap_values_acv(self, x, C, S_star, N_star):609 pass610 @abstractmethod611 def compute_sdp_clf(self, X, tX, S, data):612 pass613 @abstractmethod614 def compute_sdp_reg(self, X, tX, S, data):615 pass616 @abstractmethod617 def compute_local_sdp_clf(self, x, threshold, proba, index, data, final_coal, decay, C, verbose):618 pass619 @abstractmethod620 def compute_local_sdp_reg(self, x, threshold, proba, index, data, final_coal, decay, C, verbose):621 pass622 @abstractmethod623 def swing_values_clf(self, x, tx, S, data, threshold):624 pass625 @abstractmethod626 def swing_values_reg(self, x, tx, S, data, threshold):627 pass628 @abstractmethod629 def shap_values_swing_clf(self, x, tx, data, threshold, C):630 pass631 @abstractmethod632 def shap_values_swing_reg(self, x, tx, data, threshold, C):633 pass634 @abstractmethod635 def global_sdp_importance_clf(self, data, data_bground, columns_names, global_proba, decay, threshold,636 proba, C, verbose):637 pass638 @abstractmethod639 def global_sdp_importance_reg(self, data, data_bground, columns_names, global_proba, decay, threshold,640 proba, C, verbose):641 pass642 def predict(self, X, y=None, output=None, tree_limit=None):643 """ A consistent interface to make predictions from this model.644 Parameters645 ----------646 tree_limit : None (default) or int647 Limit the number of trees used by the model. By default None means no use the limit of the648 original model, and -1 means no limit.649 """650 if output is None:651 output = self.model_output652 if self.model_type == "pyspark":653 # import pyspark654 # TODO: support predict for pyspark655 raise NotImplementedError(656 "Predict with pyspark isn't implemented. Don't run 'interventional' as feature_perturbation.")657 # see if we have a default tree_limit in place.658 if tree_limit is None:659 tree_limit = -1 if self.tree_limit is None else self.tree_limit660 # convert dataframes661 if safe_isinstance(X, "pandas.core.series.Series"):662 X = X.values663 elif safe_isinstance(X, "pandas.core.frame.DataFrame"):664 X = X.values665 flat_output = False666 if len(X.shape) == 1:667 flat_output = True668 X = X.reshape(1, X.shape[0])669 if X.dtype.type != self.input_dtype:670 X = X.astype(self.input_dtype)671 X_missing = np.isnan(X, dtype=np.bool)672 assert isinstance(X, np.ndarray), "Unknown instance type: " + str(type(X))673 assert len(X.shape) == 2, "Passed input data matrix X must have 1 or 2 dimensions!"674 if tree_limit < 0 or tree_limit > self.values.shape[0]:675 tree_limit = self.values.shape[0]676 if output == "logloss":677 assert y is not None, "Both samples and labels must be provided when explaining the loss (i.e. `explainer.shap_values(X, y)`)!"678 assert X.shape[0] == len(679 y), "The number of labels (%d) does not match the number of samples to explain (%d)!" % (680 len(y), X.shape[0])681 # transform = self.get_transform()682 assert_import("cext_acv")683 output = np.zeros((X.shape[0], self.num_outputs))684 cext_acv.dense_tree_predict(685 self.children_left, self.children_right, self.children_default,686 self.features, self.thresholds, self.values,687 self.max_depth, tree_limit, self.base_offset,688 X, X_missing, output)689 # drop dimensions we don't need690 if flat_output:691 if self.num_outputs == 1:692 return output.flatten()[0]693 else:694 return output.reshape(-1, self.num_outputs)695 else:696 if self.num_outputs == 1:697 return output.flatten()698 else:699 return output700 # def single_predict(self, X, y=None, output=None, tree_limit=None, i=0):701 # """ A consistent interface to make predictions from this model.702 #703 # Parameters704 # ----------705 # tree_limit : None (default) or int706 # Limit the number of trees used by the model. By default None means no use the limit of the707 # original model, and -1 means no limit.708 # """709 #710 # if output is None:711 # output = self.model_output712 #713 # if self.model_type == "pyspark":714 # # import pyspark715 # # TODO: support predict for pyspark716 # raise NotImplementedError(717 # "Predict with pyspark isn't implemented. Don't run 'interventional' as feature_perturbation.")718 #719 # # see if we have a default tree_limit in place.720 # if tree_limit is None:721 # tree_limit = -1 if self.tree_limit is None else self.tree_limit722 #723 # # convert dataframes724 # if safe_isinstance(X, "pandas.core.series.Series"):725 # X = X.values726 # elif safe_isinstance(X, "pandas.core.frame.DataFrame"):727 # X = X.values728 # flat_output = False729 # if len(X.shape) == 1:730 # flat_output = True731 # X = X.reshape(1, X.shape[0])732 # if X.dtype.type != self.input_dtype:733 # X = X.astype(self.input_dtype)734 # X_missing = np.isnan(X, dtype=np.bool)735 # assert isinstance(X, np.ndarray), "Unknown instance type: " + str(type(X))736 # assert len(X.shape) == 2, "Passed input data matrix X must have 1 or 2 dimensions!"737 #738 # if tree_limit < 0 or tree_limit > self.values.shape[0]:739 # tree_limit = self.values.shape[0]740 #741 # if output == "logloss":742 # assert y is not None, "Both samples and labels must be provided when explaining the loss (i.e. `explainer.shap_values(X, y)`)!"743 # assert X.shape[0] == len(744 # y), "The number of labels (%d) does not match the number of samples to explain (%d)!" % (745 # len(y), X.shape[0])746 # # transform = self.get_transform()747 # assert_import("cext_acv")748 # output = np.zeros((X.shape[0], self.num_outputs))749 # cext_acv.single_tree_predict(750 # self.children_left, self.children_right, self.children_default,751 # self.features, self.thresholds, self.values,752 # self.max_depth, tree_limit, self.base_offset,753 # X, X_missing, output, i)754 #755 # # drop dimensions we don't need756 # if flat_output:757 # if self.num_outputs == 1:758 # return output.flatten()[0]/self.scaling759 # else:760 # return output.reshape(-1, self.num_outputs)/self.scaling761 # else:762 # if self.num_outputs == 1:763 # return output.flatten()/self.scaling764 # else:765 # return output/self.scaling766 # def shap_values(self, x, C=[[]]):767 # out = np.zeros((x.shape[0], x.shape[1], self.num_outputs))768 # for i in range(len(self.trees)):769 # out += shap_values_leaves(x, self.partition_leaves_trees[i], self.data_leaves_trees[i], self.node_idx_trees[i],770 # self.leaf_idx_trees[i], self.node_sample_weight[i], self.values[i], C, self.num_outputs)771 # return out772 #773 # def shap_values_acv(self, x, C=[[]]):774 # out = np.zeros((x.shape[0], x.shape[1], self.num_outputs))775 # for i in range(len(self.trees)):776 # out += shap_values_leaves(x, self.partition_leaves_trees[i], self.data_leaves_trees[i], self.node_idx_trees[i],777 # self.leaf_idx_trees[i], self.node_sample_weight[i], self.values[i], C, self.num_outputs)778 # return out779class SingleTree:780 """ A single decision tree.781 The primary point of this object is to parse many different tree types into a common format.782 """783 def __init__(self, tree, normalize=False, scaling=1.0, data=None, data_missing=None):784 self.scaling = scaling785 if safe_isinstance(tree, ["sklearn.tree._tree.Tree", "econml.tree._tree.Tree"]):786 self.children_left = tree.children_left.astype(np.int32)787 self.children_right = tree.children_right.astype(np.int32)788 self.children_default = self.children_left # missing values not supported in sklearn789 self.features = tree.feature.astype(np.int32)790 self.thresholds = tree.threshold.astype(np.float64)791 self.values = tree.value.reshape(tree.value.shape[0], tree.value.shape[1] * tree.value.shape[2])792 if normalize:793 self.values = (self.values.T / self.values.sum(1)).T794 self.values = self.values * scaling795 self.node_sample_weight = tree.weighted_n_node_samples.astype(np.float64)796 elif type(tree) is dict and 'features' in tree:797 self.children_left = tree["children_left"].astype(np.int32)798 self.children_right = tree["children_right"].astype(np.int32)799 self.children_default = tree["children_default"].astype(np.int32)800 self.features = tree["features"].astype(np.int32)801 self.thresholds = tree["thresholds"]802 self.values = tree["values"] * scaling803 self.node_sample_weight = tree["node_sample_weight"]804 # deprecated dictionary support (with sklearn singlular style "feature" and "value" names)805 elif type(tree) is dict and 'children_left' in tree:806 self.children_left = tree["children_left"].astype(np.int32)807 self.children_right = tree["children_right"].astype(np.int32)808 self.children_default = tree["children_default"].astype(np.int32)809 self.features = tree["feature"].astype(np.int32)810 self.thresholds = tree["threshold"]811 self.values = tree["value"] * scaling812 self.node_sample_weight = tree["node_sample_weight"]813 elif safe_isinstance(tree, "pyspark.ml.classification.DecisionTreeClassificationModel") \814 or safe_isinstance(tree, "pyspark.ml.regression.DecisionTreeRegressionModel"):815 # model._java_obj.numNodes() doesn't give leaves, need to recompute the size816 def getNumNodes(node, size):817 size = size + 1818 if node.subtreeDepth() == 0:819 return size820 else:821 size = getNumNodes(node.leftChild(), size)822 return getNumNodes(node.rightChild(), size)823 num_nodes = getNumNodes(tree._java_obj.rootNode(), 0)824 self.children_left = np.full(num_nodes, -2, dtype=np.int32)825 self.children_right = np.full(num_nodes, -2, dtype=np.int32)826 self.children_default = np.full(num_nodes, -2, dtype=np.int32)827 self.features = np.full(num_nodes, -2, dtype=np.int32)828 self.thresholds = np.full(num_nodes, -2, dtype=np.float64)829 self.values = [-2] * num_nodes830 self.node_sample_weight = np.full(num_nodes, -2, dtype=np.float64)831 def buildTree(index, node):832 index = index + 1833 if tree._java_obj.getImpurity() == 'variance':834 self.values[index] = [node.prediction()] # prediction for the node835 else:836 self.values[index] = [e for e in837 node.impurityStats().stats()] # for gini: NDarray(numLabel): 1 per label: number of item for each label which went through this node838 self.node_sample_weight[839 index] = node.impurityStats().count() # weighted count of element trough this node840 if node.subtreeDepth() == 0:841 return index842 else:843 self.features[844 index] = node.split().featureIndex() # index of the feature we split on, not available for leaf, int845 if str(node.split().getClass()).endswith('tree.CategoricalSplit'):846 # Categorical split isn't implemented, TODO: could fake it by creating a fake node to split on the exact value?847 raise NotImplementedError('CategoricalSplit are not yet implemented')848 self.thresholds[849 index] = node.split().threshold() # threshold for the feature, not available for leaf, float850 self.children_left[index] = index + 1851 idx = buildTree(index, node.leftChild())852 self.children_right[index] = idx + 1853 idx = buildTree(idx, node.rightChild())854 return idx855 buildTree(-1, tree._java_obj.rootNode())856 # default Not supported with mlib? (TODO)857 self.children_default = self.children_left858 self.values = np.asarray(self.values)859 if normalize:860 self.values = (self.values.T / self.values.sum(1)).T861 self.values = self.values * scaling862 elif type(tree) == dict and 'tree_structure' in tree: # LightGBM model dump863 start = tree['tree_structure']864 num_parents = tree['num_leaves'] - 1865 self.children_left = np.empty((2 * num_parents + 1), dtype=np.int32)866 self.children_right = np.empty((2 * num_parents + 1), dtype=np.int32)867 self.children_default = np.empty((2 * num_parents + 1), dtype=np.int32)868 self.features = np.empty((2 * num_parents + 1), dtype=np.int32)869 self.thresholds = np.empty((2 * num_parents + 1), dtype=np.float64)870 self.values = [-2] * (2 * num_parents + 1)871 self.node_sample_weight = np.empty((2 * num_parents + 1), dtype=np.float64)872 visited, queue = [], [start]873 while queue:874 vertex = queue.pop(0)875 if 'split_index' in vertex.keys():876 if vertex['split_index'] not in visited:877 if 'split_index' in vertex['left_child'].keys():878 self.children_left[vertex['split_index']] = vertex['left_child']['split_index']879 else:880 self.children_left[vertex['split_index']] = vertex['left_child']['leaf_index'] + num_parents881 if 'split_index' in vertex['right_child'].keys():882 self.children_right[vertex['split_index']] = vertex['right_child']['split_index']883 else:884 self.children_right[vertex['split_index']] = vertex['right_child'][885 'leaf_index'] + num_parents886 if vertex['default_left']:887 self.children_default[vertex['split_index']] = self.children_left[vertex['split_index']]888 else:889 self.children_default[vertex['split_index']] = self.children_right[vertex['split_index']]890 self.features[vertex['split_index']] = vertex['split_feature']891 self.thresholds[vertex['split_index']] = vertex['threshold']892 self.values[vertex['split_index']] = [vertex['internal_value']]893 self.node_sample_weight[vertex['split_index']] = vertex['internal_count']894 visited.append(vertex['split_index'])895 queue.append(vertex['left_child'])896 queue.append(vertex['right_child'])897 else:898 self.children_left[vertex['leaf_index'] + num_parents] = -1899 self.children_right[vertex['leaf_index'] + num_parents] = -1900 self.children_default[vertex['leaf_index'] + num_parents] = -1901 self.features[vertex['leaf_index'] + num_parents] = -1902 self.children_left[vertex['leaf_index'] + num_parents] = -1903 self.children_right[vertex['leaf_index'] + num_parents] = -1904 self.children_default[vertex['leaf_index'] + num_parents] = -1905 self.features[vertex['leaf_index'] + num_parents] = -1906 self.thresholds[vertex['leaf_index'] + num_parents] = -1907 self.values[vertex['leaf_index'] + num_parents] = [vertex['leaf_value']]908 self.node_sample_weight[vertex['leaf_index'] + num_parents] = vertex['leaf_count']909 self.values = np.asarray(self.values)910 self.values = np.multiply(self.values, scaling)911 elif type(tree) == dict and 'nodeid' in tree:912 """ Directly create tree given the JSON dump (with stats) of a XGBoost model.913 """914 def max_id(node):915 if "children" in node:916 return max(node["nodeid"], *[max_id(n) for n in node["children"]])917 else:918 return node["nodeid"]919 m = max_id(tree) + 1920 self.children_left = -np.ones(m, dtype=np.int32)921 self.children_right = -np.ones(m, dtype=np.int32)922 self.children_default = -np.ones(m, dtype=np.int32)923 self.features = -np.ones(m, dtype=np.int32)924 self.thresholds = np.zeros(m, dtype=np.float64)925 self.values = np.zeros((m, 1), dtype=np.float64)926 self.node_sample_weight = np.empty(m, dtype=np.float64)927 def extract_data(node, tree):928 i = node["nodeid"]929 tree.node_sample_weight[i] = node["cover"]930 if "children" in node:931 tree.children_left[i] = node["yes"]932 tree.children_right[i] = node["no"]933 tree.children_default[i] = node["missing"]934 tree.features[i] = node["split"]935 tree.thresholds[i] = node["split_condition"]936 for n in node["children"]:937 extract_data(n, tree)938 elif "leaf" in node:939 tree.values[i] = node["leaf"] * scaling940 extract_data(tree, self)941 elif type(tree) == str:942 """ Build a tree from a text dump (with stats) of xgboost.943 """944 nodes = [t.lstrip() for t in tree[:-1].split("\n")]945 nodes_dict = {}946 for n in nodes: nodes_dict[int(n.split(":")[0])] = n.split(":")[1]947 m = max(nodes_dict.keys()) + 1948 children_left = -1 * np.ones(m, dtype="int32")949 children_right = -1 * np.ones(m, dtype="int32")950 children_default = -1 * np.ones(m, dtype="int32")951 features = -2 * np.ones(m, dtype="int32")952 thresholds = -1 * np.ones(m, dtype="float64")953 values = 1 * np.ones(m, dtype="float64")954 node_sample_weight = np.zeros(m, dtype="float64")955 values_lst = list(nodes_dict.values())956 keys_lst = list(nodes_dict.keys())957 for i in range(0, len(keys_lst)):958 value = values_lst[i]959 key = keys_lst[i]960 if ("leaf" in value):961 # Extract values962 val = float(value.split("leaf=")[1].split(",")[0])963 node_sample_weight_val = float(value.split("cover=")[1])964 # Append to lists965 values[key] = val966 node_sample_weight[key] = node_sample_weight_val967 else:968 c_left = int(value.split("yes=")[1].split(",")[0])969 c_right = int(value.split("no=")[1].split(",")[0])970 c_default = int(value.split("missing=")[1].split(",")[0])971 feat_thres = value.split(" ")[0]972 if ("<" in feat_thres):973 feature = int(feat_thres.split("<")[0][2:])974 threshold = float(feat_thres.split("<")[1][:-1])975 if ("=" in feat_thres):976 feature = int(feat_thres.split("=")[0][2:])977 threshold = float(feat_thres.split("=")[1][:-1])978 node_sample_weight_val = float(value.split("cover=")[1].split(",")[0])979 children_left[key] = c_left980 children_right[key] = c_right981 children_default[key] = c_default982 features[key] = feature983 thresholds[key] = threshold984 node_sample_weight[key] = node_sample_weight_val985 self.children_left = children_left986 self.children_right = children_right987 self.children_default = children_default988 self.features = features989 self.thresholds = thresholds990 self.values = values[:, np.newaxis] * scaling991 self.node_sample_weight = node_sample_weight992 else:993 raise Exception("Unknown input to SingleTree constructor: " + str(tree))994 # Re-compute the number of samples that pass through each node if we are given data995 # if data is not None and data_missing is not None:996 # self.node_sample_weight[:] = 0.0997 # cext_acv.dense_tree_update_weights(998 # self.children_left, self.children_right, self.children_default, self.features,999 # self.thresholds, self.values, 1, self.node_sample_weight, data, data_missing1000 # )1001 # we compute the expectations to make sure they follow the SHAP logic1002 self.max_depth = cext_acv.compute_expectations(1003 self.children_left, self.children_right, self.node_sample_weight,1004 self.values1005 )1006 def predict(self, X):1007 # see if we have a default tree_limit in place.1008 children_left = np.expand_dims(self.children_left, 0)1009 children_right = np.expand_dims(self.children_right, 0)1010 children_default = np.expand_dims(self.children_default, 0)1011 features = np.expand_dims(self.features, 0)1012 thresholds = np.expand_dims(self.thresholds, 0)1013 values = np.expand_dims(self.values, 0) / self.scaling1014 # node_sample_weight = np.expand_dims(self.node_sample_weight, 0)1015 # convert dataframes1016 if safe_isinstance(X, "pandas.core.series.Series"):1017 X = X.values1018 elif safe_isinstance(X, "pandas.core.frame.DataFrame"):1019 X = X.values1020 flat_output = False1021 if len(X.shape) == 1:1022 flat_output = True1023 X = X.reshape(1, X.shape[0])1024 # if X.dtype.type != self.input_dtype:1025 # X = X.astype(self.input_dtype)1026 X_missing = np.isnan(X, dtype=np.bool)1027 assert isinstance(X, np.ndarray), "Unknown instance type: " + str(type(X))1028 assert len(X.shape) == 2, "Passed input data matrix X must have 1 or 2 dimensions!"1029 assert_import("cext_acv")1030 tree_limit = 11031 self.num_outputs = self.values.shape[1]1032 base_offset = np.zeros(self.num_outputs)1033 output = np.zeros((X.shape[0], self.num_outputs))1034 cext_acv.dense_tree_predict(1035 children_left, children_right, children_default, features, thresholds, values,1036 self.max_depth, tree_limit, base_offset, X, X_missing, output)1037 # drop dimensions we don't need1038 if flat_output:1039 if self.num_outputs == 1:1040 return output.flatten()[0]1041 else:1042 return output.reshape(-1, self.num_outputs)1043 else:1044 if self.num_outputs == 1:1045 return output.flatten()1046 else:1047 return output1048class IsoTree(SingleTree):1049 """1050 In sklearn the tree of the Isolation Forest does not calculated in a good way.1051 """1052 def __init__(self, tree, tree_features, normalize=False, scaling=1.0, data=None, data_missing=None):1053 super(IsoTree, self).__init__(tree, normalize, scaling, data, data_missing)1054 if safe_isinstance(tree, "sklearn.tree._tree.Tree"):1055 from sklearn.ensemble._iforest import _average_path_length # pylint: disable=no-name-in-module1056 def _recalculate_value(tree, i, level):1057 if tree.children_left[i] == -1 and tree.children_right[i] == -1:1058 value = level + _average_path_length(np.array([tree.n_node_samples[i]]))[0]1059 self.values[i, 0] = value1060 return value * tree.n_node_samples[i]1061 else:1062 value_left = _recalculate_value(tree, tree.children_left[i], level + 1)1063 value_right = _recalculate_value(tree, tree.children_right[i], level + 1)1064 self.values[i, 0] = (value_left + value_right) / tree.n_node_samples[i]1065 return value_left + value_right1066 _recalculate_value(tree, 0, 0)1067 if normalize:1068 self.values = (self.values.T / self.values.sum(1)).T...
tree.py
Source:tree.py
...80 raise Exception("feature_perturbation = \"independent\" is not a valid option value, please use " \81 "feature_perturbation == \"interventional\" instead. See GitHub issue #882.")82 83 84 if safe_isinstance(data, "pandas.core.frame.DataFrame"):85 self.data = data.values86 elif isinstance(data, DenseData):87 self.data = data.data88 else:89 self.data = data90 if self.data is None:91 feature_perturbation = "tree_path_dependent"92 warnings.warn("Setting feature_perturbation = \"tree_path_dependent\" because no background data was given.")93 elif feature_perturbation == "interventional" and self.data.shape[0] > 1000:94 warnings.warn("Passing "+str(self.data.shape[0]) + " background samples may lead to slow runtimes. Consider "95 "using shap.sample(data, 100) to create a smaller background data set.")96 self.data_missing = None if self.data is None else np.isnan(self.data)97 self.model_output = model_output98 self.feature_perturbation = feature_perturbation99 self.expected_value = None100 self.model = TreeEnsemble(model, self.data, self.data_missing)101 assert feature_perturbation in feature_perturbation_codes, "Invalid feature_perturbation option!"102 # check for unsupported combinations of feature_perturbation and model_outputs103 if feature_perturbation == "tree_path_dependent":104 assert model_output == "margin", "Only margin model_output is supported for feature_perturbation=\"tree_path_dependent\""105 else: 106 assert data is not None, "A background dataset must be provided unless you are using feature_perturbation=\"tree_path_dependent\"!"107 if model_output != "margin":108 if self.model.objective is None and self.model.tree_output is None:109 raise Exception("Model does not have a known objective or output type! When model_output is " \110 "not \"margin\" then we need to know the model's objective or link function.")111 # A bug in XGBoost fixed in v0.81 makes XGBClassifier fail to give margin outputs112 if safe_isinstance(model, "xgboost.sklearn.XGBClassifier") and model_output != "margin":113 import xgboost114 assert LooseVersion(xgboost.__version__) >= LooseVersion('0.81'), \115 "A bug in XGBoost fixed in v0.81 makes XGBClassifier fail to give margin outputs! Please upgrade to XGBoost >= v0.81!"116 117 # compute the expected value if we have a parsed tree for the cext118 if self.model_output == "logloss":119 self.expected_value = self.__dynamic_expected_value120 elif data is not None:121 self.expected_value = self.model.predict(self.data, output=model_output).mean(0)122 if hasattr(self.expected_value, '__len__') and len(self.expected_value) == 1:123 self.expected_value = self.expected_value[0]124 elif hasattr(self.model, "node_sample_weight"):125 self.expected_value = self.model.values[:,0].sum(0)126 if self.expected_value.size == 1:127 self.expected_value = self.expected_value[0]128 self.expected_value += self.model.base_offset129 def __dynamic_expected_value(self, y):130 """ This computes the expected value conditioned on the given label value.131 """132 return self.model.predict(self.data, np.ones(self.data.shape[0]) * y, output=self.model_output).mean(0)133 134 def shap_values(self, X, y=None, tree_limit=None, approximate=False, check_additivity=True):135 """ Estimate the SHAP values for a set of samples.136 Parameters137 ----------138 X : numpy.array, pandas.DataFrame or catboost.Pool (for catboost)139 A matrix of samples (# samples x # features) on which to explain the model's output.140 y : numpy.array141 An array of label values for each sample. Used when explaining loss functions.142 tree_limit : None (default) or int 143 Limit the number of trees used by the model. By default None means no use the limit of the144 original model, and -1 means no limit.145 approximate : bool146 Run fast, but only roughly approximate the Tree SHAP values. This runs a method147 previously proposed by Saabas which only considers a single feature ordering. Take care148 since this does not have the consistency guarantees of Shapley values and places too149 much weight on lower splits in the tree.150 check_additivity : bool151 Run a validation check that the sum of the SHAP values equals the output of the model. This152 check takes only a small amount of time, and will catch potential unforeseen errors.153 Note that this check only runs right now when explaining the margin of the model.154 Returns155 -------156 For models with a single output this returns a matrix of SHAP values157 (# samples x # features). Each row sums to the difference between the model output for that158 sample and the expected value of the model output (which is stored in the expected_value159 attribute of the explainer when it is constant). For models with vector outputs this returns160 a list of such matrices, one for each output.161 """162 # see if we have a default tree_limit in place.163 if tree_limit is None:164 tree_limit = -1 if self.model.tree_limit is None else self.model.tree_limit165 # shortcut using the C++ version of Tree SHAP in XGBoost, LightGBM, and CatBoost166 if self.feature_perturbation == "tree_path_dependent" and self.model.model_type != "internal" and self.data is None:167 model_output_vals = None168 phi = None169 if self.model.model_type == "xgboost":170 import xgboost171 if not isinstance(X, xgboost.core.DMatrix):172 X = xgboost.DMatrix(X)173 if tree_limit == -1:174 tree_limit = 0175 phi = self.model.original_model.predict(176 X, ntree_limit=tree_limit, pred_contribs=True,177 approx_contribs=approximate, validate_features=False178 )179 180 if check_additivity and self.model_output == "margin":181 model_output_vals = self.model.original_model.predict(182 X, ntree_limit=tree_limit, output_margin=True,183 validate_features=False184 )185 186 elif self.model.model_type == "lightgbm":187 assert not approximate, "approximate=True is not supported for LightGBM models!"188 phi = self.model.original_model.predict(X, num_iteration=tree_limit, pred_contrib=True)189 # Note: the data must be joined on the last axis190 if self.model.original_model.params['objective'] == 'binary':191 warnings.warn('LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray')192 phi = np.concatenate((0-phi, phi), axis=-1)193 if phi.shape[1] != X.shape[1] + 1:194 phi = phi.reshape(X.shape[0], phi.shape[1]//(X.shape[1]+1), X.shape[1]+1)195 196 elif self.model.model_type == "catboost": # thanks to the CatBoost team for implementing this...197 assert not approximate, "approximate=True is not supported for CatBoost models!"198 assert tree_limit == -1, "tree_limit is not yet supported for CatBoost models!"199 import catboost200 if type(X) != catboost.Pool:201 X = catboost.Pool(X)202 phi = self.model.original_model.get_feature_importance(data=X, fstr_type='ShapValues')203 # note we pull off the last column and keep it as our expected_value204 if phi is not None:205 if len(phi.shape) == 3:206 self.expected_value = [phi[0, i, -1] for i in range(phi.shape[1])]207 out = [phi[:, i, :-1] for i in range(phi.shape[1])]208 else:209 self.expected_value = phi[0, -1]210 out = phi[:, :-1]211 212 if check_additivity and model_output_vals is not None:213 self.assert_additivity(out, model_output_vals)214 return out215 # convert dataframes216 if safe_isinstance(X, "pandas.core.series.Series"):217 X = X.values218 elif safe_isinstance(X, "pandas.core.frame.DataFrame"):219 X = X.values220 flat_output = False221 if len(X.shape) == 1:222 flat_output = True223 X = X.reshape(1, X.shape[0])224 if X.dtype != self.model.input_dtype:225 X = X.astype(self.model.input_dtype)226 X_missing = np.isnan(X, dtype=np.bool)227 assert isinstance(X, np.ndarray), "Unknown instance type: " + str(type(X))228 assert len(X.shape) == 2, "Passed input data matrix X must have 1 or 2 dimensions!"229 if tree_limit < 0 or tree_limit > self.model.values.shape[0]:230 tree_limit = self.model.values.shape[0]231 232 if self.model_output == "logloss":233 assert y is not None, "Both samples and labels must be provided when explaining the loss (i.e. `explainer.shap_values(X, y)`)!"234 assert X.shape[0] == len(y), "The number of labels (%d) does not match the number of samples to explain (%d)!" % (len(y), X.shape[0])235 transform = self.model.get_transform(self.model_output)236 if self.feature_perturbation == "tree_path_dependent":237 assert self.model.fully_defined_weighting, "The background dataset you provided does not cover all the leaves in the model, " \238 "so TreeExplainer cannot run with the feature_perturbation=\"tree_path_dependent\" option! " \239 "Try providing a larger background dataset, or using feature_perturbation=\"interventional\"."240 241 # run the core algorithm using the C extension242 assert_import("cext")243 phi = np.zeros((X.shape[0], X.shape[1]+1, self.model.n_outputs))244 if not approximate:245 _cext.dense_tree_shap(246 self.model.children_left, self.model.children_right, self.model.children_default,247 self.model.features, self.model.thresholds, self.model.values, self.model.node_sample_weight,248 self.model.max_depth, X, X_missing, y, self.data, self.data_missing, tree_limit,249 self.model.base_offset, phi, feature_perturbation_codes[self.feature_perturbation],250 output_transform_codes[transform], False251 )252 else:253 _cext.dense_tree_saabas(254 self.model.children_left, self.model.children_right, self.model.children_default,255 self.model.features, self.model.thresholds, self.model.values,256 self.model.max_depth, tree_limit, self.model.base_offset, output_transform_codes[transform], 257 X, X_missing, y, phi258 )259 # note we pull off the last column and keep it as our expected_value260 if self.model.n_outputs == 1:261 if self.model_output != "logloss":262 self.expected_value = phi[0, -1, 0]263 if flat_output:264 out = phi[0, :-1, 0]265 else:266 out = phi[:, :-1, 0]267 else:268 if self.model_output != "logloss":269 self.expected_value = [phi[0, -1, i] for i in range(phi.shape[2])]270 if flat_output:271 out = [phi[0, :-1, i] for i in range(self.model.n_outputs)]272 else:273 out = [phi[:, :-1, i] for i in range(self.model.n_outputs)]274 if check_additivity and self.model_output == "margin":275 self.assert_additivity(out, self.model.predict(X))276 return out277 def shap_interaction_values(self, X, y=None, tree_limit=None):278 """ Estimate the SHAP interaction values for a set of samples.279 Parameters280 ----------281 X : numpy.array, pandas.DataFrame or catboost.Pool (for catboost)282 A matrix of samples (# samples x # features) on which to explain the model's output.283 y : numpy.array284 An array of label values for each sample. Used when explaining loss functions (not yet supported).285 tree_limit : None (default) or int 286 Limit the number of trees used by the model. By default None means no use the limit of the287 original model, and -1 means no limit.288 Returns289 -------290 For models with a single output this returns a tensor of SHAP values291 (# samples x # features x # features). The matrix (# features x # features) for each sample sums292 to the difference between the model output for that sample and the expected value of the model output293 (which is stored in the expected_value attribute of the explainer). Each row of this matrix sums to the294 SHAP value for that feature for that sample. The diagonal entries of the matrix represent the295 "main effect" of that feature on the prediction and the symmetric off-diagonal entries represent the296 interaction effects between all pairs of features for that sample. For models with vector outputs297 this returns a list of tensors, one for each output.298 """299 assert self.model_output == "margin", "Only model_output = \"margin\" is supported for SHAP interaction values right now!"300 assert self.feature_perturbation == "tree_path_dependent", "Only feature_perturbation = \"tree_path_dependent\" is supported for SHAP interaction values right now!"301 transform = "identity"302 # see if we have a default tree_limit in place.303 if tree_limit is None:304 tree_limit = -1 if self.model.tree_limit is None else self.model.tree_limit305 # shortcut using the C++ version of Tree SHAP in XGBoost306 if self.model.model_type == "xgboost":307 import xgboost308 if not isinstance(X, xgboost.core.DMatrix):309 X = xgboost.DMatrix(X)310 if tree_limit == -1:311 tree_limit = 0312 phi = self.model.original_model.predict(X, ntree_limit=tree_limit, pred_interactions=True)313 # note we pull off the last column and keep it as our expected_value314 if len(phi.shape) == 4:315 self.expected_value = [phi[0, i, -1, -1] for i in range(phi.shape[1])]316 return [phi[:, i, :-1, :-1] for i in range(phi.shape[1])]317 else:318 self.expected_value = phi[0, -1, -1]319 return phi[:, :-1, :-1]320 # convert dataframes321 if safe_isinstance(X, "pandas.core.series.Series"):322 X = X.values323 elif safe_isinstance(X, "pandas.core.frame.DataFrame"):324 X = X.values325 flat_output = False326 if len(X.shape) == 1:327 flat_output = True328 X = X.reshape(1, X.shape[0])329 if X.dtype != self.model.input_dtype:330 X = X.astype(self.model.input_dtype)331 X_missing = np.isnan(X, dtype=np.bool)332 assert isinstance(X, np.ndarray), "Unknown instance type: " + str(type(X))333 assert len(X.shape) == 2, "Passed input data matrix X must have 1 or 2 dimensions!"334 if tree_limit < 0 or tree_limit > self.model.values.shape[0]:335 tree_limit = self.model.values.shape[0]336 # run the core algorithm using the C extension337 assert_import("cext")338 phi = np.zeros((X.shape[0], X.shape[1]+1, X.shape[1]+1, self.model.n_outputs))339 _cext.dense_tree_shap(340 self.model.children_left, self.model.children_right, self.model.children_default,341 self.model.features, self.model.thresholds, self.model.values, self.model.node_sample_weight,342 self.model.max_depth, X, X_missing, y, self.data, self.data_missing, tree_limit,343 self.model.base_offset, phi, feature_perturbation_codes[self.feature_perturbation],344 output_transform_codes[transform], True345 )346 # note we pull off the last column and keep it as our expected_value347 if self.model.n_outputs == 1:348 self.expected_value = phi[0, -1, -1, 0]349 if flat_output:350 out = phi[0, :-1, :-1, 0]351 else:352 out = phi[:, :-1, :-1, 0]353 else:354 self.expected_value = [phi[0, -1, -1, i] for i in range(phi.shape[3])]355 if flat_output:356 out = [phi[0, :-1, :-1, i] for i in range(self.model.n_outputs)]357 else:358 out = [phi[:, :-1, :-1, i] for i in range(self.model.n_outputs)]359 360 return out361 def assert_additivity(self, phi, model_output):362 err_msg = "Additivity check failed in TreeExplainer! Please report this on GitHub."363 if self.feature_perturbation != "interventional":364 err_msg += " Consider retrying with the feature_perturbation='interventional' option."365 if type(phi) is list:366 for i in range(len(phi)):367 val = self.expected_value[i] + phi[i].sum(-1)368 assert np.max(np.abs(val - model_output[:,i]) / (np.abs(val) + 1e-4)) < 1e-2, err_msg369 else:370 val = self.expected_value + phi.sum(-1)371 assert np.max(np.abs(val - model_output) / (np.abs(val) + 1e-4)) < 1e-2, err_msg372class TreeEnsemble:373 """ An ensemble of decision trees.374 This object provides a common interface to many different types of models.375 """376 def __init__(self, model, data=None, data_missing=None):377 self.model_type = "internal"378 self.trees = None379 less_than_or_equal = True380 self.base_offset = 0381 self.objective = None # what we explain when explaining the loss of the model382 self.tree_output = None # what are the units of the values in the leaves of the trees383 self.internal_dtype = np.float64384 self.input_dtype = np.float64 # for sklearn we need to use np.float32 to always get exact matches to their predictions385 self.data = data386 self.data_missing = data_missing387 self.fully_defined_weighting = True # does the background dataset land in every leaf (making it valid for the tree_path_dependent method)388 self.tree_limit = None # used for limiting the number of trees we use by default (like from early stopping) 389 # we use names like keras390 objective_name_map = {391 "mse": "squared_error",392 "variance": "squared_error",393 "friedman_mse": "squared_error",394 "reg:linear": "squared_error",395 "reg:squarederror": "squared_error",396 "regression": "squared_error",397 "regression_l2": "squared_error",398 "mae": "absolute_error",399 "gini": "binary_crossentropy",400 "entropy": "binary_crossentropy",401 "binary:logistic": "binary_crossentropy",402 "binary_logloss": "binary_crossentropy",403 "binary": "binary_crossentropy"404 }405 tree_output_name_map = {406 "regression": "raw_value",407 "regression_l2": "squared_error",408 "reg:linear": "raw_value",409 "reg:squarederror": "raw_value",410 "binary:logistic": "log_odds",411 "binary_logloss": "log_odds",412 "binary": "log_odds"413 }414 if type(model) is dict and "trees" in model:415 # This allows a dictionary to be passed that represents the model.416 # this dictionary has several numerica paramters and also a list of trees417 # where each tree is a dictionary describing that tree418 if "internal_dtype" in model:419 self.internal_dtype = model["internal_dtype"]420 if "input_dtype" in model:421 self.input_dtype = model["input_dtype"]422 if "objective" in model:423 self.objective = model["objective"]424 if "tree_output" in model:425 self.tree_output = model["tree_output"]426 if "base_offset" in model:427 self.base_offset = model["base_offset"]428 self.trees = [Tree(t, data=data, data_missing=data_missing) for t in model["trees"]]429 elif type(model) is list and type(model[0]) == Tree: # old-style direct-load format430 self.trees = model431 elif safe_isinstance(model, "sklearn.ensemble.forest.RandomForestRegressor"):432 assert hasattr(model, "estimators_"), "Model has no `estimators_`! Have you called `model.fit`?"433 self.internal_dtype = model.estimators_[0].tree_.value.dtype.type434 self.input_dtype = np.float32435 scaling = 1.0 / len(model.estimators_) # output is average of trees436 self.trees = [Tree(e.tree_, scaling=scaling, data=data, data_missing=data_missing) for e in model.estimators_]437 self.objective = objective_name_map.get(model.criterion, None)438 self.tree_output = "raw_value"439 elif safe_isinstance(model, "sklearn.ensemble.iforest.IsolationForest"):440 self.dtype = np.float32441 scaling = 1.0 / len(model.estimators_) # output is average of trees442 self.trees = [IsoTree(e.tree_, scaling=scaling, data=data, data_missing=data_missing) for e in model.estimators_]443 self.tree_output = "raw_value"444 elif safe_isinstance(model, "skopt.learning.forest.RandomForestRegressor"):445 assert hasattr(model, "estimators_"), "Model has no `estimators_`! Have you called `model.fit`?"446 self.internal_dtype = model.estimators_[0].tree_.value.dtype.type447 self.input_dtype = np.float32448 scaling = 1.0 / len(model.estimators_) # output is average of trees449 self.trees = [Tree(e.tree_, scaling=scaling, data=data, data_missing=data_missing) for e in model.estimators_]450 self.objective = objective_name_map.get(model.criterion, None)451 self.tree_output = "raw_value"452 elif safe_isinstance(model, "sklearn.ensemble.forest.ExtraTreesRegressor"):453 assert hasattr(model, "estimators_"), "Model has no `estimators_`! Have you called `model.fit`?"454 self.internal_dtype = model.estimators_[0].tree_.value.dtype.type455 self.input_dtype = np.float32456 scaling = 1.0 / len(model.estimators_) # output is average of trees457 self.trees = [Tree(e.tree_, scaling=scaling, data=data, data_missing=data_missing) for e in model.estimators_]458 self.objective = objective_name_map.get(model.criterion, None)459 self.tree_output = "raw_value"460 elif safe_isinstance(model, "skopt.learning.forest.ExtraTreesRegressor"):461 assert hasattr(model, "estimators_"), "Model has no `estimators_`! Have you called `model.fit`?"462 self.internal_dtype = model.estimators_[0].tree_.value.dtype.type463 self.input_dtype = np.float32464 scaling = 1.0 / len(model.estimators_) # output is average of trees465 self.trees = [Tree(e.tree_, scaling=scaling, data=data, data_missing=data_missing) for e in model.estimators_]466 self.objective = objective_name_map.get(model.criterion, None)467 self.tree_output = "raw_value"468 elif safe_isinstance(model, "sklearn.tree.tree.DecisionTreeRegressor"):469 self.internal_dtype = model.tree_.value.dtype.type470 self.input_dtype = np.float32471 self.trees = [Tree(model.tree_, data=data, data_missing=data_missing)]472 self.objective = objective_name_map.get(model.criterion, None)473 self.tree_output = "raw_value"474 elif safe_isinstance(model, "sklearn.tree.tree.DecisionTreeClassifier"):475 self.internal_dtype = model.tree_.value.dtype.type476 self.input_dtype = np.float32477 self.trees = [Tree(model.tree_, normalize=True, data=data, data_missing=data_missing)]478 self.objective = objective_name_map.get(model.criterion, None)479 self.tree_output = "probability"480 elif safe_isinstance(model, "sklearn.ensemble.forest.RandomForestClassifier"):481 assert hasattr(model, "estimators_"), "Model has no `estimators_`! Have you called `model.fit`?"482 self.internal_dtype = model.estimators_[0].tree_.value.dtype.type483 self.input_dtype = np.float32484 scaling = 1.0 / len(model.estimators_) # output is average of trees485 self.trees = [Tree(e.tree_, normalize=True, scaling=scaling, data=data, data_missing=data_missing) for e in model.estimators_]486 self.objective = objective_name_map.get(model.criterion, None)487 self.tree_output = "probability"488 elif safe_isinstance(model, "sklearn.ensemble.forest.ExtraTreesClassifier"): # TODO: add unit test for this case489 assert hasattr(model, "estimators_"), "Model has no `estimators_`! Have you called `model.fit`?"490 self.internal_dtype = model.estimators_[0].tree_.value.dtype.type491 self.input_dtype = np.float32492 scaling = 1.0 / len(model.estimators_) # output is average of trees493 self.trees = [Tree(e.tree_, normalize=True, scaling=scaling, data=data, data_missing=data_missing) for e in model.estimators_]494 self.objective = objective_name_map.get(model.criterion, None)495 self.tree_output = "probability"496 elif safe_isinstance(model, "sklearn.ensemble.gradient_boosting.GradientBoostingRegressor"):497 self.input_dtype = np.float32498 # currently we only support the mean and quantile estimators499 if safe_isinstance(model.init_, "sklearn.ensemble.gradient_boosting.MeanEstimator"):500 self.base_offset = model.init_.mean501 elif safe_isinstance(model.init_, "sklearn.ensemble.gradient_boosting.QuantileEstimator"):502 self.base_offset = model.init_.quantile503 elif safe_isinstance(model.init_, "sklearn.dummy.DummyRegressor"):504 self.base_offset = model.init_.constant_[0]505 else:506 assert False, "Unsupported init model type: " + str(type(model.init_))507 self.trees = [Tree(e.tree_, scaling=model.learning_rate, data=data, data_missing=data_missing) for e in model.estimators_[:,0]]508 self.objective = objective_name_map.get(model.criterion, None)509 self.tree_output = "raw_value"510 elif safe_isinstance(model, "sklearn.ensemble.gradient_boosting.GradientBoostingClassifier"):511 self.input_dtype = np.float32512 # TODO: deal with estimators for each class513 if model.estimators_.shape[1] > 1:514 assert False, "GradientBoostingClassifier is only supported for binary classification right now!"515 516 # currently we only support the logs odds estimator517 if safe_isinstance(model.init_, "sklearn.ensemble.gradient_boosting.LogOddsEstimator"):518 self.base_offset = model.init_.prior519 self.tree_output = "log_odds"520 elif safe_isinstance(model.init_, "sklearn.dummy.DummyClassifier"):521 self.base_offset = scipy.special.logit(model.init_.class_prior_[1]) # with two classes the trees only model the second class522 self.tree_output = "log_odds"523 else:524 assert False, "Unsupported init model type: " + str(type(model.init_))525 self.trees = [Tree(e.tree_, scaling=model.learning_rate, data=data, data_missing=data_missing) for e in model.estimators_[:,0]]526 self.objective = objective_name_map.get(model.criterion, None)527 elif "pyspark.ml" in str(type(model)):528 assert_import("pyspark")529 self.original_model = model530 self.model_type = "pyspark"531 # model._java_obj.getImpurity() can be gini, entropy or variance.532 self.objective = objective_name_map.get(model._java_obj.getImpurity(), None)533 if "Classification" in str(type(model)):534 normalize = True535 self.tree_output = "probability"536 else:537 normalize = False538 self.tree_output = "raw_value"539 # Spark Random forest, create 1 weighted (avg) tree per sub-model540 if safe_isinstance(model, "pyspark.ml.classification.RandomForestClassificationModel") \541 or safe_isinstance(model, "pyspark.ml.regression.RandomForestRegressionModel"):542 sum_weight = sum(model.treeWeights) # output is average of trees543 self.trees = [Tree(tree, normalize=normalize, scaling=model.treeWeights[i]/sum_weight) for i, tree in enumerate(model.trees)]544 # Spark GBT, create 1 weighted (learning rate) tree per sub-model545 elif safe_isinstance(model, "pyspark.ml.classification.GBTClassificationModel") \546 or safe_isinstance(model, "pyspark.ml.regression.GBTRegressionModel"):547 self.objective = "squared_error" # GBT subtree use the variance548 self.tree_output = "raw_value"549 self.trees = [Tree(tree, normalize=False, scaling=model.treeWeights[i]) for i, tree in enumerate(model.trees)]550 # Spark Basic model (single tree)551 elif safe_isinstance(model, "pyspark.ml.classification.DecisionTreeClassificationModel") \552 or safe_isinstance(model, "pyspark.ml.regression.DecisionTreeRegressionModel"):553 self.trees = [Tree(model, normalize=normalize, scaling=1)]554 else:555 assert False, "Unsupported Spark model type: " + str(type(model))556 elif safe_isinstance(model, "xgboost.core.Booster"):557 import xgboost558 self.original_model = model559 self.model_type = "xgboost"560 xgb_loader = XGBTreeModelLoader(self.original_model)561 self.trees = xgb_loader.get_trees(data=data, data_missing=data_missing)562 self.base_offset = xgb_loader.base_score563 less_than_or_equal = False564 self.objective = objective_name_map.get(xgb_loader.name_obj, None)565 self.tree_output = tree_output_name_map.get(xgb_loader.name_obj, None)566 elif safe_isinstance(model, "xgboost.sklearn.XGBClassifier"):567 import xgboost568 self.input_dtype = np.float32569 self.model_type = "xgboost"570 self.original_model = model.get_booster()571 xgb_loader = XGBTreeModelLoader(self.original_model)572 self.trees = xgb_loader.get_trees(data=data, data_missing=data_missing)573 self.base_offset = xgb_loader.base_score574 less_than_or_equal = False575 self.objective = objective_name_map.get(xgb_loader.name_obj, None)576 self.tree_output = tree_output_name_map.get(xgb_loader.name_obj, None)577 self.tree_limit = getattr(model, "best_ntree_limit", None)578 elif safe_isinstance(model, "xgboost.sklearn.XGBRegressor"):579 import xgboost580 self.original_model = model.get_booster()581 self.model_type = "xgboost"582 xgb_loader = XGBTreeModelLoader(self.original_model)583 self.trees = xgb_loader.get_trees(data=data, data_missing=data_missing)584 self.base_offset = xgb_loader.base_score585 less_than_or_equal = False586 self.objective = objective_name_map.get(xgb_loader.name_obj, None)587 self.tree_output = tree_output_name_map.get(xgb_loader.name_obj, None)588 self.tree_limit = getattr(model, "best_ntree_limit", None)589 elif safe_isinstance(model, "xgboost.sklearn.XGBRanker"):590 import xgboost591 self.original_model = model.get_booster()592 self.model_type = "xgboost"593 xgb_loader = XGBTreeModelLoader(self.original_model)594 self.trees = xgb_loader.get_trees(data=data, data_missing=data_missing)595 self.base_offset = xgb_loader.base_score596 less_than_or_equal = False597 # Note: for ranker, leaving tree_output and objective as None as they598 # are not implemented in native code yet599 self.tree_limit = getattr(model, "best_ntree_limit", None)600 elif safe_isinstance(model, "lightgbm.basic.Booster"):601 assert_import("lightgbm")602 self.model_type = "lightgbm"603 self.original_model = model604 tree_info = self.original_model.dump_model()["tree_info"]605 try:606 self.trees = [Tree(e, data=data, data_missing=data_missing) for e in tree_info]607 except:608 self.trees = None # we get here because the cext can't handle categorical splits yet609 610 self.objective = objective_name_map.get(model.params.get("objective", "regression"), None)611 self.tree_output = tree_output_name_map.get(model.params.get("objective", "regression"), None)612 613 elif safe_isinstance(model, "lightgbm.sklearn.LGBMRegressor"):614 assert_import("lightgbm")615 self.model_type = "lightgbm"616 self.original_model = model.booster_617 tree_info = self.original_model.dump_model()["tree_info"]618 try:619 self.trees = [Tree(e, data=data, data_missing=data_missing) for e in tree_info]620 except:621 self.trees = None # we get here because the cext can't handle categorical splits yet622 self.objective = objective_name_map.get(model.objective, None)623 self.tree_output = tree_output_name_map.get(model.objective, None)624 if model.objective is None:625 self.objective = "squared_error"626 self.tree_output = "raw_value"627 elif safe_isinstance(model, "lightgbm.sklearn.LGBMRanker"):628 assert_import("lightgbm")629 self.model_type = "lightgbm"630 self.original_model = model.booster_631 tree_info = self.original_model.dump_model()["tree_info"]632 try:633 self.trees = [Tree(e, data=data, data_missing=data_missing) for e in tree_info]634 except:635 self.trees = None # we get here because the cext can't handle categorical splits yet636 # Note: for ranker, leaving tree_output and objective as None as they637 # are not implemented in native code yet638 elif safe_isinstance(model, "lightgbm.sklearn.LGBMClassifier"):639 assert_import("lightgbm")640 self.model_type = "lightgbm"641 self.original_model = model.booster_642 tree_info = self.original_model.dump_model()["tree_info"]643 try:644 self.trees = [Tree(e, data=data, data_missing=data_missing) for e in tree_info]645 except:646 self.trees = None # we get here because the cext can't handle categorical splits yet647 self.objective = objective_name_map.get(model.objective, None)648 self.tree_output = tree_output_name_map.get(model.objective, None)649 if model.objective is None:650 self.objective = "binary_crossentropy"651 self.tree_output = "log_odds"652 elif safe_isinstance(model, "catboost.core.CatBoostRegressor"):653 assert_import("catboost")654 self.model_type = "catboost"655 self.original_model = model656 elif safe_isinstance(model, "catboost.core.CatBoostClassifier"):657 assert_import("catboost")658 self.model_type = "catboost"659 self.original_model = model660 self.input_dtype = np.float32661 cb_loader = CatBoostTreeModelLoader(model)662 self.trees = cb_loader.get_trees(data=data, data_missing=data_missing)663 self.tree_output = "log_odds"664 self.objective = "binary_crossentropy"665 elif safe_isinstance(model, "catboost.core.CatBoost"):666 assert_import("catboost")667 self.model_type = "catboost"668 self.original_model = model669 elif safe_isinstance(model, "imblearn.ensemble._forest.BalancedRandomForestClassifier"):670 self.input_dtype = np.float32671 scaling = 1.0 / len(model.estimators_) # output is average of trees672 self.trees = [Tree(e.tree_, normalize=True, scaling=scaling, data=data, data_missing=data_missing) for e in model.estimators_]673 self.objective = objective_name_map.get(model.criterion, None)674 self.tree_output = "probability"675 else:676 raise Exception("Model type not yet supported by TreeExplainer: " + str(type(model)))677 678 # build a dense numpy version of all the tree objects679 if self.trees is not None and self.trees:680 max_nodes = np.max([len(t.values) for t in self.trees])681 assert len(np.unique([t.values.shape[1] for t in self.trees])) == 1, "All trees in the ensemble must have the same output dimension!"682 ntrees = len(self.trees)683 self.n_outputs = self.trees[0].values.shape[1]684 # important to be -1 in unused sections!! This way we can tell which entries are valid.685 self.children_left = -np.ones((ntrees, max_nodes), dtype=np.int32)686 self.children_right = -np.ones((ntrees, max_nodes), dtype=np.int32)687 self.children_default = -np.ones((ntrees, max_nodes), dtype=np.int32)688 self.features = -np.ones((ntrees, max_nodes), dtype=np.int32)689 self.thresholds = np.zeros((ntrees, max_nodes), dtype=self.internal_dtype)690 self.values = np.zeros((ntrees, max_nodes, self.trees[0].values.shape[1]), dtype=self.internal_dtype)691 self.node_sample_weight = np.zeros((ntrees, max_nodes), dtype=self.internal_dtype)692 693 for i in range(ntrees):694 l = len(self.trees[i].features)695 self.children_left[i,:l] = self.trees[i].children_left696 self.children_right[i,:l] = self.trees[i].children_right697 self.children_default[i,:l] = self.trees[i].children_default698 self.features[i,:l] = self.trees[i].features699 self.thresholds[i,:l] = self.trees[i].thresholds700 self.values[i,:l,:] = self.trees[i].values701 self.node_sample_weight[i,:l] = self.trees[i].node_sample_weight702 # ensure that the passed background dataset lands in every leaf703 if np.min(self.trees[i].node_sample_weight) <= 0:704 self.fully_defined_weighting = False705 706 # If we should do <= then we nudge the thresholds to make our <= work like <707 if not less_than_or_equal:708 self.thresholds = np.nextafter(self.thresholds, -np.inf)709 710 self.num_nodes = np.array([len(t.values) for t in self.trees], dtype=np.int32)711 self.max_depth = np.max([t.max_depth for t in self.trees])712 def get_transform(self, model_output):713 """ A consistent interface to make predictions from this model.714 """715 if model_output == "margin":716 transform = "identity"717 elif model_output == "probability":718 if self.tree_output == "log_odds":719 transform = "logistic"720 elif self.tree_output == "probability":721 transform = "identity"722 else:723 raise Exception("model_output = \"probability\" is not yet supported when model.tree_output = \"" + self.tree_output + "\"!")724 elif model_output == "logloss":725 if self.objective == "squared_error":726 transform = "squared_loss"727 elif self.objective == "binary_crossentropy":728 transform = "logistic_nlogloss"729 else:730 raise Exception("model_output = \"logloss\" is not yet supported when model.objective = \"" + self.objective + "\"!")731 else:732 assert False, "Unrecognized model_output parameter value: " + model_output733 734 return transform735 def predict(self, X, y=None, output="margin", tree_limit=None):736 """ A consistent interface to make predictions from this model.737 Parameters738 ----------739 tree_limit : None (default) or int 740 Limit the number of trees used by the model. By default None means no use the limit of the741 original model, and -1 means no limit.742 """743 if self.model_type == "pyspark":744 import pyspark745 #TODO support predict for pyspark746 raise NotImplementedError("Predict with pyspark isn't implemented")747 # see if we have a default tree_limit in place.748 if tree_limit is None:749 tree_limit = -1 if self.tree_limit is None else self.tree_limit750 # convert dataframes751 if safe_isinstance(X, "pandas.core.series.Series"):752 X = X.values753 elif safe_isinstance(X, "pandas.core.frame.DataFrame"):754 X = X.values755 flat_output = False756 if len(X.shape) == 1:757 flat_output = True758 X = X.reshape(1, X.shape[0])759 if X.dtype.type != self.input_dtype:760 X = X.astype(self.input_dtype)761 X_missing = np.isnan(X, dtype=np.bool)762 assert isinstance(X, np.ndarray), "Unknown instance type: " + str(type(X))763 assert len(X.shape) == 2, "Passed input data matrix X must have 1 or 2 dimensions!"764 if tree_limit < 0 or tree_limit > self.values.shape[0]:765 tree_limit = self.values.shape[0]766 if output == "logloss":767 assert y is not None, "Both samples and labels must be provided when explaining the loss (i.e. `explainer.shap_values(X, y)`)!"768 assert X.shape[0] == len(y), "The number of labels (%d) does not match the number of samples to explain (%d)!" % (len(y), X.shape[0])769 transform = self.get_transform(output)770 771 if True or self.model_type == "internal":772 output = np.zeros((X.shape[0], self.n_outputs))773 assert_import("cext")774 _cext.dense_tree_predict(775 self.children_left, self.children_right, self.children_default,776 self.features, self.thresholds, self.values,777 self.max_depth, tree_limit, self.base_offset, output_transform_codes[transform], 778 X, X_missing, y, output779 )780 elif self.model_type == "xgboost":781 import xgboost782 output = self.original_model.predict(X, output_margin=True, tree_limit=tree_limit)783 # drop dimensions we don't need784 if flat_output:785 if self.n_outputs == 1:786 return output.flatten()[0]787 else:788 return output.reshape(-1, self.n_outputs)789 else:790 if self.n_outputs == 1:791 return output.flatten()792 else:793 return output794class Tree:795 """ A single decision tree.796 The primary point of this object is to parse many different tree types into a common format.797 """798 def __init__(self, tree, normalize=False, scaling=1.0, data=None, data_missing=None):799 assert_import("cext")800 if safe_isinstance(tree, "sklearn.tree._tree.Tree"):801 self.children_left = tree.children_left.astype(np.int32)802 self.children_right = tree.children_right.astype(np.int32)803 self.children_default = self.children_left # missing values not supported in sklearn804 self.features = tree.feature.astype(np.int32)805 self.thresholds = tree.threshold.astype(np.float64)806 self.values = tree.value.reshape(tree.value.shape[0], tree.value.shape[1] * tree.value.shape[2])807 if normalize:808 self.values = (self.values.T / self.values.sum(1)).T809 self.values = self.values * scaling810 self.node_sample_weight = tree.weighted_n_node_samples.astype(np.float64)811 elif type(tree) is dict and 'features' in tree:812 self.children_left = tree["children_left"].astype(np.int32)813 self.children_right = tree["children_right"].astype(np.int32)814 self.children_default = tree["children_default"].astype(np.int32)815 self.features = tree["features"].astype(np.int32)816 self.thresholds = tree["thresholds"]817 self.values = tree["values"] * scaling818 self.node_sample_weight = tree["node_sample_weight"]819 # deprecated dictionary support (with sklearn singlular style "feature" and "value" names)820 elif type(tree) is dict and 'children_left' in tree:821 self.children_left = tree["children_left"].astype(np.int32)822 self.children_right = tree["children_right"].astype(np.int32)823 self.children_default = tree["children_default"].astype(np.int32)824 self.features = tree["feature"].astype(np.int32)825 self.thresholds = tree["threshold"]826 self.values = tree["value"] * scaling827 self.node_sample_weight = tree["node_sample_weight"]828 elif safe_isinstance(tree, "pyspark.ml.classification.DecisionTreeClassificationModel") \829 or safe_isinstance(tree, "pyspark.ml.regression.DecisionTreeRegressionModel"):830 #model._java_obj.numNodes() doesn't give leaves, need to recompute the size831 def getNumNodes(node, size):832 size = size + 1833 if node.subtreeDepth() == 0:834 return size835 else:836 size = getNumNodes(node.leftChild(), size)837 return getNumNodes(node.rightChild(), size)838 num_nodes = getNumNodes(tree._java_obj.rootNode(), 0)839 self.children_left = np.full(num_nodes, -2, dtype=np.int32)840 self.children_right = np.full(num_nodes, -2, dtype=np.int32)841 self.children_default = np.full(num_nodes, -2, dtype=np.int32)842 self.features = np.full(num_nodes, -2, dtype=np.int32)843 self.thresholds = np.full(num_nodes, -2, dtype=np.float64)844 self.values = [-2]*num_nodes845 self.node_sample_weight = np.full(num_nodes, -2, dtype=np.float64)846 def buildTree(index, node):847 index = index + 1848 if tree._java_obj.getImpurity() == 'variance':849 self.values[index] = [node.prediction()] #prediction for the node850 else:851 self.values[index] = [e for e in node.impurityStats().stats()] #for gini: NDarray(numLabel): 1 per label: number of item for each label which went through this node852 self.node_sample_weight[index] = node.impurityStats().count() #weighted count of element trough this node853 if node.subtreeDepth() == 0:854 return index855 else:856 self.features[index] = node.split().featureIndex() #index of the feature we split on, not available for leaf, int857 if str(node.split().getClass()).endswith('tree.CategoricalSplit'):858 #Categorical split isn't implemented, TODO: could fake it by creating a fake node to split on the exact value?859 raise NotImplementedError('CategoricalSplit are not yet implemented')860 self.thresholds[index] = node.split().threshold() #threshold for the feature, not available for leaf, float861 self.children_left[index] = index + 1862 idx = buildTree(index, node.leftChild())863 self.children_right[index] = idx + 1864 idx = buildTree(idx, node.rightChild())865 return idx866 buildTree(-1, tree._java_obj.rootNode())867 #default Not supported with mlib? (TODO)868 self.children_default = self.children_left869 self.values = np.asarray(self.values)870 if normalize:871 self.values = (self.values.T / self.values.sum(1)).T872 self.values = self.values * scaling873 elif type(tree) == dict and 'tree_structure' in tree:874 start = tree['tree_structure']875 num_parents = tree['num_leaves']-1876 self.children_left = np.empty((2*num_parents+1), dtype=np.int32)877 self.children_right = np.empty((2*num_parents+1), dtype=np.int32)878 self.children_default = np.empty((2*num_parents+1), dtype=np.int32)879 self.features = np.empty((2*num_parents+1), dtype=np.int32)880 self.thresholds = np.empty((2*num_parents+1), dtype=np.float64)881 self.values = [-2]*(2*num_parents+1)882 self.node_sample_weight = np.empty((2*num_parents+1), dtype=np.float64)883 visited, queue = [], [start]884 while queue:885 vertex = queue.pop(0)886 if 'split_index' in vertex.keys():887 if vertex['split_index'] not in visited:888 if 'split_index' in vertex['left_child'].keys():889 self.children_left[vertex['split_index']] = vertex['left_child']['split_index']890 else:891 self.children_left[vertex['split_index']] = vertex['left_child']['leaf_index']+num_parents892 if 'split_index' in vertex['right_child'].keys():893 self.children_right[vertex['split_index']] = vertex['right_child']['split_index']894 else:895 self.children_right[vertex['split_index']] = vertex['right_child']['leaf_index']+num_parents896 if vertex['default_left']:897 self.children_default[vertex['split_index']] = self.children_left[vertex['split_index']]898 else:899 self.children_default[vertex['split_index']] = self.children_right[vertex['split_index']]900 self.features[vertex['split_index']] = vertex['split_feature']901 self.thresholds[vertex['split_index']] = vertex['threshold']902 self.values[vertex['split_index']] = [vertex['internal_value']]903 self.node_sample_weight[vertex['split_index']] = vertex['internal_count']904 visited.append(vertex['split_index'])905 queue.append(vertex['left_child'])906 queue.append(vertex['right_child'])907 else:908 self.children_left[vertex['leaf_index']+num_parents] = -1909 self.children_right[vertex['leaf_index']+num_parents] = -1910 self.children_default[vertex['leaf_index']+num_parents] = -1911 self.features[vertex['leaf_index']+num_parents] = -1912 self.children_left[vertex['leaf_index']+num_parents] = -1913 self.children_right[vertex['leaf_index']+num_parents] = -1914 self.children_default[vertex['leaf_index']+num_parents] = -1915 self.features[vertex['leaf_index']+num_parents] = -1916 self.thresholds[vertex['leaf_index']+num_parents] = -1917 self.values[vertex['leaf_index']+num_parents] = [vertex['leaf_value']]918 self.node_sample_weight[vertex['leaf_index']+num_parents] = vertex['leaf_count']919 self.values = np.asarray(self.values)920 self.values = np.multiply(self.values, scaling)921 922 elif type(tree) == dict and 'nodeid' in tree:923 """ Directly create tree given the JSON dump (with stats) of a XGBoost model.924 """925 def max_id(node):926 if "children" in node:927 return max(node["nodeid"], *[max_id(n) for n in node["children"]])928 else:929 return node["nodeid"]930 931 m = max_id(tree) + 1932 self.children_left = -np.ones(m, dtype=np.int32)933 self.children_right = -np.ones(m, dtype=np.int32)934 self.children_default = -np.ones(m, dtype=np.int32)935 self.features = -np.ones(m, dtype=np.int32)936 self.thresholds = np.zeros(m, dtype=np.float64)937 self.values = np.zeros((m, 1), dtype=np.float64)938 self.node_sample_weight = np.empty(m, dtype=np.float64)939 def extract_data(node, tree):940 i = node["nodeid"]941 tree.node_sample_weight[i] = node["cover"]942 if "children" in node:943 tree.children_left[i] = node["yes"]944 tree.children_right[i] = node["no"]945 tree.children_default[i] = node["missing"]946 tree.features[i] = node["split"]947 tree.thresholds[i] = node["split_condition"]948 for n in node["children"]:949 extract_data(n, tree)950 elif "leaf" in node:951 tree.values[i] = node["leaf"] * scaling952 extract_data(tree, self)953 954 elif type(tree) == str:955 """ Build a tree from a text dump (with stats) of xgboost.956 """957 nodes = [t.lstrip() for t in tree[:-1].split("\n")]958 nodes_dict = {}959 for n in nodes: nodes_dict[int(n.split(":")[0])] = n.split(":")[1]960 m = max(nodes_dict.keys())+1961 children_left = -1*np.ones(m,dtype="int32")962 children_right = -1*np.ones(m,dtype="int32")963 children_default = -1*np.ones(m,dtype="int32")964 features = -2*np.ones(m,dtype="int32")965 thresholds = -1*np.ones(m,dtype="float64")966 values = 1*np.ones(m,dtype="float64")967 node_sample_weight = np.zeros(m,dtype="float64")968 values_lst = list(nodes_dict.values())969 keys_lst = list(nodes_dict.keys())970 for i in range(0,len(keys_lst)):971 value = values_lst[i]972 key = keys_lst[i]973 if ("leaf" in value):974 # Extract values975 val = float(value.split("leaf=")[1].split(",")[0])976 node_sample_weight_val = float(value.split("cover=")[1])977 # Append to lists978 values[key] = val979 node_sample_weight[key] = node_sample_weight_val980 else:981 c_left = int(value.split("yes=")[1].split(",")[0])982 c_right = int(value.split("no=")[1].split(",")[0])983 c_default = int(value.split("missing=")[1].split(",")[0])984 feat_thres = value.split(" ")[0]985 if ("<" in feat_thres):986 feature = int(feat_thres.split("<")[0][2:])987 threshold = float(feat_thres.split("<")[1][:-1])988 if ("=" in feat_thres):989 feature = int(feat_thres.split("=")[0][2:])990 threshold = float(feat_thres.split("=")[1][:-1])991 node_sample_weight_val = float(value.split("cover=")[1].split(",")[0])992 children_left[key] = c_left993 children_right[key] = c_right994 children_default[key] = c_default995 features[key] = feature996 thresholds[key] = threshold997 node_sample_weight[key] = node_sample_weight_val998 999 self.children_left = children_left1000 self.children_right = children_right1001 self.children_default = children_default1002 self.features = features1003 self.thresholds = thresholds1004 self.values = values[:,np.newaxis] * scaling1005 self.node_sample_weight = node_sample_weight1006 else:1007 raise Exception("Unknown input to Tree constructor!")1008 1009 # Re-compute the number of samples that pass through each node if we are given data1010 if data is not None and data_missing is not None:1011 self.node_sample_weight[:] = 0.01012 _cext.dense_tree_update_weights(1013 self.children_left, self.children_right, self.children_default, self.features,1014 self.thresholds, self.values, 1, self.node_sample_weight, data, data_missing1015 )1016 1017 # we compute the expectations to make sure they follow the SHAP logic1018 self.max_depth = _cext.compute_expectations(1019 self.children_left, self.children_right, self.node_sample_weight,1020 self.values1021 )1022class IsoTree(Tree):1023 """ 1024 In sklearn the tree of the Isolation Forest does not calculated in a good way.1025 """1026 def __init__(self, tree, normalize=False, scaling=1.0, data=None, data_missing=None):1027 super(IsoTree, self).__init__(tree, normalize, scaling, data, data_missing)1028 if safe_isinstance(tree, "sklearn.tree._tree.Tree"):1029 from sklearn.ensemble.iforest import _average_path_length1030 def _recalculate_value(tree, i , level):1031 if tree.children_left[i] == -1 and tree.children_right[i] == -1:1032 value = level + _average_path_length(np.array([tree.n_node_samples[i]]))[0]1033 self.values[i, 0] = value1034 return value * tree.n_node_samples[i]1035 else:1036 value_left = _recalculate_value(tree, tree.children_left[i] , level + 1)1037 value_right = _recalculate_value(tree, tree.children_right[i] , level + 1)1038 self.values[i, 0] = (value_left + value_right) / tree.n_node_samples[i]1039 return value_left + value_right1040 _recalculate_value(tree, 0, 0)1041 if normalize:1042 self.values = (self.values.T / self.values.sum(1)).T...
utils.py
Source:utils.py
...767778def is_tree_model(model):79 if type(model) is dict and "trees" in model or \80 safe_isinstance(model,81 ["sklearn.ensemble.RandomForestRegressor", "sklearn.ensemble.forest.RandomForestRegressor"]) \82 or safe_isinstance(model, ["sklearn.ensemble.IsolationForest", "sklearn.ensemble.iforest.IsolationForest"]) \83 or safe_isinstance(model, "skopt.learning.forest.RandomForestRegressor") \84 or safe_isinstance(model,85 ["sklearn.ensemble.ExtraTreesRegressor", "sklearn.ensemble.forest.ExtraTreesRegressor"]) \86 or safe_isinstance(model, "skopt.learning.forest.ExtraTreesRegressor") \87 or safe_isinstance(model, ["sklearn.tree.DecisionTreeRegressor", "sklearn.tree.tree.DecisionTreeRegressor"]) \88 or safe_isinstance(model,89 ["sklearn.tree.DecisionTreeClassifier", "sklearn.tree.tree.DecisionTreeClassifier"]) \90 or safe_isinstance(model, ["sklearn.ensemble.RandomForestClassifier",91 "sklearn.ensemble.forest.RandomForestClassifier"]) \92 or safe_isinstance(model, ["sklearn.ensemble.ExtraTreesClassifier",93 "sklearn.ensemble.forest.ExtraTreesClassifier"]) \94 or safe_isinstance(model, ["sklearn.ensemble.GradientBoostingRegressor",95 "sklearn.ensemble.gradient_boosting.GradientBoostingRegressor"]) \96 or safe_isinstance(model, ["sklearn.ensemble.GradientBoostingClassifier",97 "sklearn.ensemble.gradient_boosting.GradientBoostingClassifier"]) \98 or safe_isinstance(model, "xgboost.core.Booster") \99 or safe_isinstance(model, "xgboost.sklearn.XGBClassifier") \100 or safe_isinstance(model, "xgboost.sklearn.XGBRegressor") \101 or safe_isinstance(model, "xgboost.sklearn.XGBRanker") \102 or safe_isinstance(model, "lightgbm.basic.Booster") \103 or safe_isinstance(model, "lightgbm.sklearn.LGBMRegressor") \104 or safe_isinstance(model, "lightgbm.sklearn.LGBMRanker") \105 or safe_isinstance(model, "lightgbm.sklearn.LGBMClassifier") \106 or safe_isinstance(model, "catboost.core.CatBoostRegressor") \107 or safe_isinstance(model, "catboost.core.CatBoostClassifier") \108 or safe_isinstance(model, "catboost.core.CatBoost") \109 or safe_isinstance(model, "imblearn.ensemble._forest.BalancedRandomForestClassifier"):110 return True111 else:112 return False113114115def check_empty(d, errstr ='the input is empty'):116 if d is None:117 raise ValueError(errstr)118119# binning function120def bin_me(act, pred, n_bins):121 "bin values in arrays act and pred into (n_bins+1) bins and return aggregated values in a data frame"122123 n = act.size
...
Learn to execute automation testing from scratch with LambdaTest Learning Hub. Right from setting up the prerequisites to run your first automation test, to following best practices and diving deeper into advanced test scenarios. LambdaTest Learning Hubs compile a list of step-by-step guides to help you be proficient with different test automation frameworks i.e. Selenium, Cypress, TestNG etc.
You could also refer to video tutorials over LambdaTest YouTube channel to get step by step demonstration from industry experts.
Get 100 minutes of automation test minutes FREE!!