自定义交互#

在本 notebook 中,我们将禁用 interpret API 内置的自动交互检测功能,改为自行检测交互项,再将它们纳入 EBM。我们还会检测并使用三阶交互;这通常不是必需的,但有时会很有用。

本 notebook 位于 GitHub 上的 示例文件夹 中。

# Install interpret (together with pandas and scikit-learn, which the cells
# below import) only when interpret cannot be imported in this environment.
# NOTE: the `!pip` line is IPython/Jupyter shell syntax, so this cell only
# runs inside a notebook kernel, not as a plain Python script.
try:
    import interpret
except ModuleNotFoundError:
    !pip install --quiet interpret pandas scikit-learn
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from interpret.glassbox import ExplainableBoostingClassifier

from interpret import set_visualize_provider
from interpret.provider import InlineProvider
# Select the inline provider for interpret's visualizations.
# NOTE(review): presumably this renders explanations directly in the notebook
# output rather than through a separate dashboard -- confirm in interpret docs.
set_visualize_provider(InlineProvider())

# Fix the seed up front so the split (and the models that reuse `seed`)
# are reproducible.
seed = 42
np.random.seed(seed)

# The file is read with header=None (no header row), so the column names are
# supplied here. The last column, "Income", is used as the target.
column_names = [
    "Age",
    "WorkClass",
    "fnlwgt",
    "Education",
    "EducationNum",
    "MaritalStatus",
    "Occupation",
    "Relationship",
    "Race",
    "Gender",
    "CapitalGain",
    "CapitalLoss",
    "HoursPerWeek",
    "NativeCountry",
    "Income",
]
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
    header=None,
    names=column_names,
)

# Every column except the last is a feature; the last one is the label.
X = df[column_names[:-1]]
y = df[column_names[-1]]

# Hold out 20% of the rows for the final evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=seed
)

构建主效应模型

# Fit an EBM with interactions=0 so that automatic interaction detection is
# switched off; interaction terms are detected and added by hand in later cells.
ebm1 = ExplainableBoostingClassifier(random_state=seed, interactions=0)
ebm1.fit(X_train, y_train)
ExplainableBoostingClassifier(interactions=0)
在 Jupyter 环境中,请重新运行此单元格以显示 HTML 表示或信任该 notebook。
在 GitHub 上,HTML 表示无法渲染,请尝试使用 nbviewer.org 加载此页面。

识别特征对(二阶交互)

from itertools import combinations
from interpret.utils import measure_interactions

n_features = X_train.shape[1]

# Measure every possible feature pair, passing the mains-only model (ebm1)
# as init_score so strengths are measured on top of its predictions.
all_pairs = combinations(range(n_features), 2)
ranked_pairs = measure_interactions(
    X_train, y_train, interactions=all_pairs, init_score=ebm1
)

# Select the top 10 pairs, keeping only the feature-index tuples and
# discarding the strength values.
pairs = [features for features, _ in ranked_pairs[:10]]

构建纯特征对(二阶交互)模型

# Train only the selected pair terms (main effects excluded), starting from
# ebm1's predictions supplied through init_score.
ebm2 = ExplainableBoostingClassifier(
    interactions=pairs, exclude="mains", random_state=seed
)
ebm2.fit(X_train, y_train, init_score=ebm1)

# Wherever ebm2 holds an empty bin definition for a feature, borrow ebm1's
# definition so that no feature is left without one.
ebm2.bins_ = [
    pair_bins if len(pair_bins) > 0 else main_bins
    for main_bins, pair_bins in zip(ebm1.bins_, ebm2.bins_)
]

将主效应和纯特征对(二阶交互)合并到单个模型中

from interpret.glassbox import merge_ebms

pair_models = [ebm1, ebm2]
n_pair_models = len(pair_models)
ebm_pairs = merge_ebms(pair_models)

# The two EBMs share no terms, so merge_ebms treats each term as having zero
# scores in the model that lacks it. Multiplying every term's scores by the
# number of merged models (2.0) undoes that. The bin_weights_ are scaled down
# by the same factor because both models were built on the same underlying
# features.
for term_idx in range(len(ebm_pairs.term_features_)):
    ebm_pairs.scale(term_idx, float(n_pair_models))
    ebm_pairs.bin_weights_[term_idx] *= 1.0 / n_pair_models

# The models are being summed rather than averaged, so the intercepts add up.
ebm_pairs.intercept_ = ebm1.intercept_ + ebm2.intercept_

# Clear the bagged and standard-deviation attributes on the merged model.
# NOTE(review): presumably they no longer describe the rescaled model -- confirm.
for stale_attr in ("bagged_intercept_", "bagged_scores_", "standard_deviations_"):
    setattr(ebm_pairs, stale_attr, None)

确定三阶交互

# Measure every possible feature triple, passing the mains + pairs model
# (ebm_pairs) as init_score so strengths are measured on top of its predictions.
all_triples = combinations(range(n_features), 3)
ranked_triples = measure_interactions(
    X_train, y_train, interactions=all_triples, init_score=ebm_pairs
)

# Select the top 10 triples, keeping only the feature-index tuples and
# discarding the strength values.
triples = [features for features, _ in ranked_triples[:10]]

构建纯三阶交互 EBM

# Train only the selected three-way terms (main effects excluded), starting
# from the mains + pairs predictions supplied through init_score.
ebm3 = ExplainableBoostingClassifier(
    interactions=triples, exclude="mains", random_state=seed
)
ebm3.fit(X_train, y_train, init_score=ebm_pairs)

# Wherever ebm3 holds an empty bin definition for a feature, borrow ebm1's
# definition so that no feature is left without one.
ebm3.bins_ = [
    triple_bins if len(triple_bins) > 0 else main_bins
    for main_bins, triple_bins in zip(ebm1.bins_, ebm3.bins_)
]
/opt/hostedtoolcache/Python/3.9.21/x64/lib/python3.9/site-packages/interpret/glassbox/_ebm/_ebm.py:1343: UserWarning: Interactions with 3 or more terms are not graphed in global explanations. Local explanations are still available and exact.
  warn(

将主效应、特征对(二阶交互)和三阶交互合并到单个模型中

triple_models = [ebm1, ebm2, ebm3]
n_triple_models = len(triple_models)
ebm_triples = merge_ebms(triple_models)

# The three EBMs share no terms, so merge_ebms treats each term as having zero
# scores in the models that lack it. Multiplying every term's scores by the
# number of merged models (3.0) undoes that. The bin_weights_ are scaled down
# by the same factor because all models were built on the same underlying
# features.
for term_idx in range(len(ebm_triples.term_features_)):
    ebm_triples.scale(term_idx, float(n_triple_models))
    ebm_triples.bin_weights_[term_idx] *= 1.0 / n_triple_models

# The models are being summed rather than averaged, so the intercepts add up.
ebm_triples.intercept_ = ebm1.intercept_ + ebm2.intercept_ + ebm3.intercept_

# Clear the bagged and standard-deviation attributes on the merged model.
# NOTE(review): presumably they no longer describe the rescaled model -- confirm.
for stale_attr in ("bagged_intercept_", "bagged_scores_", "standard_deviations_"):
    setattr(ebm_triples, stale_attr, None)

评估 EBMs

from sklearn.metrics import log_loss

# Test-set log loss of the mains-only model.
proba_mains = ebm1.predict_proba(X_test)
loss1 = log_loss(y_test, proba_mains)
print(loss1)

# Test-set log loss of the hand-built mains + pairs model.
proba_pairs = ebm_pairs.predict_proba(X_test)
loss2 = log_loss(y_test, proba_pairs)
print(loss2)

# Baseline for comparison: an EBM built with interactions=10 so that it
# auto-discovers its own pairs.
ebm_default = ExplainableBoostingClassifier(interactions=10, random_state=seed)
ebm_default.fit(X_train, y_train)
proba_default = ebm_default.predict_proba(X_test)
loss2_default = log_loss(y_test, proba_default)
print(loss2_default)

# Test-set log loss of the hand-built mains + pairs + triples model.
proba_triples = ebm_triples.predict_proba(X_test)
loss3 = log_loss(y_test, proba_triples)
print(loss3)
0.2737982347532057
0.2736888414781412
0.27346458571152205
0.27368958880625815