自定义交互
在本 notebook 中，我们将禁用 interpret API 内置的自动交互检测功能，改为自行检测交互项，然后将它们纳入 EBM 中。我们还将检测并使用三阶交互项；这通常不是必需的，但有时会很有用。
本 notebook 位于 GitHub 上的 示例文件夹 中。
# install interpret if not already installed
try:
import interpret
except ModuleNotFoundError:
!pip install --quiet interpret pandas scikit-learn
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from interpret.glassbox import ExplainableBoostingClassifier
from interpret import set_visualize_provider
from interpret.provider import InlineProvider
# Use the inline provider for interpret's visualizations.
set_visualize_provider(InlineProvider())

# Load the "adult" data file from the UCI repository. The file has no header
# row, so the column names are assigned explicitly afterwards.
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
    header=None,
)
df.columns = [
    "Age",
    "WorkClass",
    "fnlwgt",
    "Education",
    "EducationNum",
    "MaritalStatus",
    "Occupation",
    "Relationship",
    "Race",
    "Gender",
    "CapitalGain",
    "CapitalLoss",
    "HoursPerWeek",
    "NativeCountry",
    "Income",
]

# The last column ("Income") is the target; every other column is a feature.
y = df["Income"]
X = df.drop(columns="Income")

# Fix the seed so the split (and the models that reuse `seed`) are repeatable.
seed = 42
np.random.seed(seed)

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=seed
)
构建主效应模型
# Mains-only model: interactions=0 requests no interaction terms, so the
# interactions can be detected and added by hand in the following steps.
ebm1 = ExplainableBoostingClassifier(interactions=0, random_state=seed)
ebm1.fit(X_train, y_train)
ExplainableBoostingClassifier(interactions=0)
在 Jupyter 环境中，请重新运行此单元格以显示 HTML 表示，或将该 notebook 设为受信任。
在 GitHub 上，HTML 表示无法渲染，请尝试使用 nbviewer.org 加载此页面。
ExplainableBoostingClassifier(interactions=0)
确定成对交互（二阶交互）
from itertools import combinations
from interpret.utils import measure_interactions

n_features = X_train.shape[1]

# Score every possible feature pair, measured on top of the mains-only model
# (passed as init_score).
candidate_pairs = combinations(range(n_features), 2)
ranked_pairs = measure_interactions(
    X_train,
    y_train,
    interactions=candidate_pairs,
    init_score=ebm1,
)

# Keep only the feature-index tuples of the first 10 entries (the top 10 pairs).
pairs = [interaction for interaction, _strength in ranked_pairs[:10]]
构建仅含成对交互（二阶交互）的模型
# Pairs-only model: the main effects are excluded and boosting starts from
# ebm1's predictions (init_score), so ebm2 holds only the selected pair terms.
ebm2 = ExplainableBoostingClassifier(
    interactions=pairs,
    exclude="mains",
    random_state=seed,
)
ebm2.fit(X_train, y_train, init_score=ebm1)

# Wherever ebm2 has an empty bin definition for a feature, fall back to
# ebm1's definition so that no feature is left without bins.
ebm2.bins_ = [
    pair_bins if len(pair_bins) != 0 else main_bins
    for main_bins, pair_bins in zip(ebm1.bins_, ebm2.bins_)
]
将主效应模型与纯成对交互（二阶交互）模型合并为单个模型
from interpret.glassbox import merge_ebms

ebm_pairs = merge_ebms([ebm1, ebm2])

# The two models share no terms, so merge_ebms treats each term as having
# zero scores in the model that lacks it. Multiply every term's scores by 2.0
# to undo that, and halve the bin weights because both models were built on
# the same underlying features.
for term_idx, _ in enumerate(ebm_pairs.term_features_):
    ebm_pairs.scale(term_idx, 2.0)
    ebm_pairs.bin_weights_[term_idx] *= 0.5

# The goal is the sum of the two models, not their average, so the
# intercepts are added together.
ebm_pairs.intercept_ = ebm1.intercept_ + ebm2.intercept_

# Clear the bagged and standard-deviation attributes of the merged model.
# NOTE(review): presumably they no longer describe the rescaled model - confirm.
ebm_pairs.standard_deviations_ = None
ebm_pairs.bagged_scores_ = None
ebm_pairs.bagged_intercept_ = None
确定三阶交互
# Score every possible feature triple, measured on top of the combined
# mains + pairs model (passed as init_score).
candidate_triples = combinations(range(n_features), 3)
ranked_triples = measure_interactions(
    X_train,
    y_train,
    interactions=candidate_triples,
    init_score=ebm_pairs,
)

# Keep only the feature-index tuples of the first 10 entries (the top 10 triples).
triples = [interaction for interaction, _strength in ranked_triples[:10]]
构建纯三阶交互 EBM
# Triples-only model: the main effects are excluded and boosting starts from
# ebm_pairs' predictions (init_score), so ebm3 holds only the triple terms.
ebm3 = ExplainableBoostingClassifier(
    interactions=triples,
    exclude="mains",
    random_state=seed,
)
ebm3.fit(X_train, y_train, init_score=ebm_pairs)

# Wherever ebm3 has an empty bin definition for a feature, fall back to
# ebm1's definition so that no feature is left without bins.
ebm3.bins_ = [
    triple_bins if len(triple_bins) != 0 else main_bins
    for main_bins, triple_bins in zip(ebm1.bins_, ebm3.bins_)
]
/opt/hostedtoolcache/Python/3.9.21/x64/lib/python3.9/site-packages/interpret/glassbox/_ebm/_ebm.py:1343: UserWarning: Interactions with 3 or more terms are not graphed in global explanations. Local explanations are still available and exact.
warn(
将主效应、成对交互（二阶交互）和三阶交互合并到单个模型中
ebm_triples = merge_ebms([ebm1, ebm2, ebm3])

# The three models share no terms, so merge_ebms treats each term as having
# zero scores in the models that lack it. Multiply every term's scores by 3.0
# to undo that, and cut the bin weights to a third because all three models
# were built on the same underlying features.
for term_idx, _ in enumerate(ebm_triples.term_features_):
    ebm_triples.scale(term_idx, 3.0)
    ebm_triples.bin_weights_[term_idx] *= 1.0 / 3.0

# The goal is the sum of the three models, not their average, so the
# intercepts are added together.
ebm_triples.intercept_ = ebm1.intercept_ + ebm2.intercept_ + ebm3.intercept_

# Clear the bagged and standard-deviation attributes of the merged model.
# NOTE(review): presumably they no longer describe the rescaled model - confirm.
ebm_triples.standard_deviations_ = None
ebm_triples.bagged_scores_ = None
ebm_triples.bagged_intercept_ = None
评估 EBMs
from sklearn.metrics import log_loss

# Test-set log loss of the mains-only model.
proba_mains = ebm1.predict_proba(X_test)
loss1 = log_loss(y_test, proba_mains)
print(loss1)

# Test-set log loss of the hand-built mains + pairs model.
proba_pairs = ebm_pairs.predict_proba(X_test)
loss2 = log_loss(y_test, proba_pairs)
print(loss2)

# Baseline for comparison: an EBM that auto-discovers 10 pairs on its own.
ebm_default = ExplainableBoostingClassifier(interactions=10, random_state=seed)
ebm_default.fit(X_train, y_train)
proba_default = ebm_default.predict_proba(X_test)
loss2_default = log_loss(y_test, proba_default)
print(loss2_default)

# Test-set log loss of the hand-built mains + pairs + triples model.
proba_triples = ebm_triples.predict_proba(X_test)
loss3 = log_loss(y_test, proba_triples)
print(loss3)
0.2737982347532057
0.2736888414781412
0.27346458571152205
0.27368958880625815