跳转至

快速上手

5 分钟跑通一条完整的评分卡训练流水线。本节使用 合成数据 演示,不依赖任何外部数据源。

0. 准备

export PYTHONPATH="${PYTHONPATH}:$(pwd)"
# 假设当前在 SuperModelingFactory 仓库根目录

如果要运行模型解释示例,请安装可选解释依赖:

pip install 'supermodelingfactory[explain]'

1. 造一份合成样本

import numpy as np
import pandas as pd

rng = np.random.default_rng(42)
n = 5000

data = pd.DataFrame({
    "user_id":    np.arange(n),
    "age":        rng.normal(35, 8, n).clip(18, 70),
    "income":     rng.lognormal(10, 0.4, n),
    "score_b":    rng.normal(600, 60, n),
    "city_grade": rng.choice(["A", "B", "C", "D"], n),
    "n_overdue":  rng.poisson(0.3, n),
})

# 合成一个与特征相关的坏样本率
logit = -6 + 0.02 * (data["score_b"] - 600) + 0.001 * (data["income"] - data["income"].mean())
prob = 1 / (1 + np.exp(-logit))
data["bad_flag"] = rng.binomial(1, prob)

print(data.head())
print("坏样本率:", data["bad_flag"].mean())

2. 样本切分

from Modeling_Tool import SampleSplitter

features = ["age", "income", "score_b", "city_grade", "n_overdue"]
splitter = SampleSplitter(test_size=0.3, random_state=42, stratify=True)
train_df, test_df = splitter.split_df(data, target="bad_flag")
print(f"train={len(train_df)}  test={len(test_df)}")

3. WOE 编码

from Modeling_Tool import WOE_Master

woe = WOE_Master(train_data=train_df, varlist=features, dep="bad_flag")
woe.fit(nbins=10, equal_freq=True)

train_woe = woe.transform(train_df)
test_woe  = woe.transform(test_df)

print(train_woe[[f"{f}_woe" for f in features]].head())

4. 训练逻辑回归

from Modeling_Tool import LRMaster

woe_features = [f"{f}_woe" for f in features]

lr = LRMaster(params={"C": 1.0, "max_iter": 1000, "solver": "lbfgs"})
# fit 接收 (data, varlist, tgt_name)
lr.fit(train_woe, woe_features, "bad_flag")

coef = lr.get_statsmodel_summary()
print(coef)

5. 训练 LightGBM

from Modeling_Tool import GradientBoostingModel

gbm = GradientBoostingModel(
    "lgb",
    params={
        "n_estimators": 200,
        "learning_rate": 0.05,
        "max_depth": 4,
        "early_stopping_rounds": 20,
        "eval_metric": "auc",
    },
)
gbm.fit(
    train_woe[woe_features], train_woe["bad_flag"],
    test_woe[woe_features],  test_woe["bad_flag"],
)

也可以一行切换到 XGBoost / CatBoost

GradientBoostingModel 是统一接口,把 "lgb" 换成 "xgb""cat" 即可在不改其余代码的 情况下训练 XGBoost / CatBoost,例如 GradientBoostingModel("cat", {"n_estimators": 200})

6. 模型评估

from Modeling_Tool import PerformanceEvaluator

evaluator = PerformanceEvaluator(
    tgt_name="bad_flag",
    model=gbm._model.model,
    feature_cols=woe_features,
)
evaluator.add_dataset("train", train_woe).add_dataset("test", test_woe)
perf = evaluator.evaluate()

print(perf[["index", "KS", "AUC", "Top10%_TargetRate"]])

样本带权重?

若 DataFrame 含 sample_wgt 等权重列,在 fit / PerformanceEvaluator / GainsTableCalculator 等处传入 weight_col="sample_wgt" 即可;训练与评估请使用同一列名。详见 模型训练 — 样本权重模型评估 — 样本权重评估

7. 模型解释

from Modeling_Tool import ModelExplainer

explain_x = test_woe[woe_features]
background_x = train_woe[woe_features].sample(n=min(1000, len(train_woe)), random_state=42)
focus_feature = woe_features[0]

explainer = ModelExplainer(gbm, background_data=background_x)

# SHAP
explainer.explain(explain_x)
shap_importance = explainer.feature_importance(normalize=True)
local_shap = explainer.explain_instance(explain_x.iloc[[0]])

# PDP / ICE / ALE
pdp_curve = explainer.partial_dependence(explain_x, focus_feature, sample_size=1000, random_state=42)
ice_curve = explainer.ice(explain_x, focus_feature, sample_size=100, centered=True, random_state=42)
ale_curve = explainer.ale(explain_x, focus_feature, bins=20)

# LIME
lime_local = explainer.lime_explain_instance(
    explain_x.iloc[0],
    X_train=background_x,
    num_features=10,
    num_samples=3000,
    random_state=42,
)
lime_global = explainer.lime_global_importance(
    explain_x,
    X_train=background_x,
    sample_size=50,
    num_features=10,
    num_samples=1000,
    random_state=42,
)

print(shap_importance.head(10))
print(pdp_curve.head())
print(ice_curve.head())
print(ale_curve.head())
print(lime_local.head(10))
print(lime_global.head(10))

8. 生成 Excel 报告

from ExcelMaster.ExcelMaster import ExcelMaster

em = ExcelMaster("model_report.xlsx", verbose=False)
ws = em.add_worksheet("Performance")

em.merge_col(ws, ncols=5, text="LightGBM 模型性能汇总")
em.write_dataframe(
    ws, perf,
    title="性能指标",
    titleformat="BLUE_H2",
    headerformat="ORANGE_H4",
    valueformat="NUM%.4",
)
em.close_workbook()

print("已生成 model_report.xlsx")

完整脚本

将以上片段合并:

import numpy as np
import pandas as pd
from Modeling_Tool import (
    SampleSplitter, WOE_Master, LRMaster,
    GradientBoostingModel, PerformanceEvaluator, ModelExplainer,
)
from ExcelMaster.ExcelMaster import ExcelMaster

# 1) 数据
rng = np.random.default_rng(42)
n = 5000
data = pd.DataFrame({
    "user_id":    np.arange(n),
    "age":        rng.normal(35, 8, n).clip(18, 70),
    "income":     rng.lognormal(10, 0.4, n),
    "score_b":    rng.normal(600, 60, n),
    "city_grade": rng.choice(["A", "B", "C", "D"], n),
    "n_overdue":  rng.poisson(0.3, n),
})
logit = -6 + 0.02 * (data["score_b"] - 600) + 0.001 * (data["income"] - data["income"].mean())
data["bad_flag"] = rng.binomial(1, 1 / (1 + np.exp(-logit)))

features = ["age", "income", "score_b", "city_grade", "n_overdue"]

# 2) 切分
train_df, test_df = SampleSplitter(test_size=0.3, random_state=42, stratify=True) \
                     .split_df(data, target="bad_flag")

# 3) WOE
woe = WOE_Master(train_data=train_df, varlist=features, dep="bad_flag")
woe.fit(nbins=10, equal_freq=True)
train_woe, test_woe = woe.transform(train_df), woe.transform(test_df)
woe_features = [f"{f}_woe" for f in features]

# 4) LightGBM
gbm = GradientBoostingModel("lgb", {"n_estimators": 200, "learning_rate": 0.05})
gbm.fit(train_woe[woe_features], train_woe["bad_flag"],
        test_woe[woe_features],  test_woe["bad_flag"])

# 5) 评估
perf = PerformanceEvaluator(
    tgt_name="bad_flag",
    model=gbm._model.model,
    feature_cols=woe_features,
).add_dataset("train", train_woe).add_dataset("test", test_woe).evaluate()

# 6) 解释
explain_x = test_woe[woe_features]
background_x = train_woe[woe_features].sample(n=min(1000, len(train_woe)), random_state=42)
focus_feature = woe_features[0]
explainer = ModelExplainer(gbm, background_data=background_x)

explainer.explain(explain_x)
shap_importance = explainer.feature_importance(normalize=True)
pdp_curve = explainer.partial_dependence(explain_x, focus_feature, sample_size=1000, random_state=42)
ice_curve = explainer.ice(explain_x, focus_feature, sample_size=100, centered=True, random_state=42)
ale_curve = explainer.ale(explain_x, focus_feature, bins=20)
lime_local = explainer.lime_explain_instance(explain_x.iloc[0], X_train=background_x, num_features=10)
lime_global = explainer.lime_global_importance(explain_x, X_train=background_x, sample_size=50)

# 7) 报告
em = ExcelMaster("model_report.xlsx", verbose=False)
ws = em.add_worksheet("Performance")
em.write_dataframe(ws, perf, title="模型性能",
                   titleformat="BLUE_H2",
                   headerformat="ORANGE_H4",
                   valueformat="NUM%.4")
em.close_workbook()

下一步

  • 想了解每一步的更多选项,请阅读 端到端流水线
  • 想深入模型解释方法,请阅读 模型解释
  • 想看所有公开 API 的详细说明,请跳转到 API 参考
  • 想直接生成可直接部署的脚本,请参考各 用户指南 中的代码片段