跳转至

Modeling_Tool.Model

模型训练层 —— LR、LightGBM、XGBoost、后向变量消元。

逻辑回归 — LRM_Tool

LRM_Tool

FeatureSelectionAnalyzer

Feature selection analyzer using statistical tests.

Analyzes feature relevance using chi-squared tests, correlation analysis, and variance inflation factor (VIF) for multicollinearity detection.

参数:

| 名称 | 类型 | 描述 | 默认 |
| --- | --- | --- | --- |
| significance_level | float | Significance level for statistical tests | 0.05 |

示例:

>>> analyzer = FeatureSelectionAnalyzer(significance_level=0.05)
>>> results = analyzer.chi2_selection(train_df, feature_cols, 'target')
>>> vif_df = analyzer.compute_vif(train_df[feature_cols])
源代码位于: Modeling_Tool/Model/LRM_Tool.py
class FeatureSelectionAnalyzer:
    """
    Statistical feature-screening helper.

    Bundles three independent screens: a chi-squared relevance test against
    the target, variance inflation factors (VIF) for multicollinearity
    detection, and a pairwise-correlation filter.

    Parameters
    ----------
    significance_level : float, default 0.05
        p-value cut-off below which `chi2_selection` flags a feature as selected

    Attributes
    ----------
    selected_features_ : list of str or None
        Features flagged by the most recent `chi2_selection` call
        (None until it runs)
    chi2_results_ : pd.DataFrame or None
        Full result table of the most recent `chi2_selection` call
        (None until it runs)

    Examples
    --------
    >>> analyzer = FeatureSelectionAnalyzer(significance_level=0.05)
    >>> results = analyzer.chi2_selection(train_df, feature_cols, 'target')
    >>> vif_df = analyzer.compute_vif(train_df[feature_cols])
    """

    def __init__(self, significance_level=0.05):
        """
        Store the significance threshold and reset cached results.

        Parameters
        ----------
        significance_level : float, default 0.05
            Significance level threshold for feature selection
        """
        self.significance_level = significance_level
        self.selected_features_ = None
        self.chi2_results_ = None

    def chi2_selection(self, data, feature_cols, target_col):
        """
        Rank features with a chi-squared test against the target.

        Missing feature values are replaced with 0 and every feature is
        min-max scaled before the test (scikit-learn's `chi2` requires
        non-negative inputs). The result table and the list of selected
        features are also cached on the instance.

        Parameters
        ----------
        data : pd.DataFrame
            Input data holding both the features and the target
        feature_cols : list of str
            Feature column names to evaluate
        target_col : str
            Target variable column name

        Returns
        -------
        pd.DataFrame
            Columns ['feature', 'chi2', 'p_value', 'selected'], sorted by
            'chi2' in descending order
        """
        from sklearn.feature_selection import chi2
        from sklearn.preprocessing import MinMaxScaler

        features = data[feature_cols].fillna(0)
        target = data[target_col]

        scaled = MinMaxScaler().fit_transform(features)
        statistics, p_values = chi2(scaled, target)

        table = pd.DataFrame({
            'feature': feature_cols,
            'chi2': statistics,
            'p_value': p_values,
            'selected': p_values < self.significance_level,
        })
        table = table.sort_values('chi2', ascending=False).reset_index(drop=True)

        self.chi2_results_ = table
        # Selected features keep the chi2-descending order of the table.
        self.selected_features_ = table.loc[table['selected'], 'feature'].tolist()
        return table

    def compute_vif(self, data):
        """
        Compute the Variance Inflation Factor (VIF) of every column.

        Missing values are replaced with 0 before the computation.

        Parameters
        ----------
        data : pd.DataFrame
            Feature matrix (should not include target variable)

        Returns
        -------
        pd.DataFrame
            Columns ['feature', 'VIF'], sorted by VIF in descending order
        """
        from statsmodels.stats.outliers_influence import variance_inflation_factor

        # NOTE(review): the matrix is passed as-is, i.e. no intercept column
        # is added here -- confirm whether callers are expected to supply one.
        matrix = data.fillna(0).values
        scores = [
            variance_inflation_factor(matrix, idx)
            for idx in range(matrix.shape[1])
        ]
        table = pd.DataFrame({'feature': data.columns, 'VIF': scores})
        return table.sort_values('VIF', ascending=False).reset_index(drop=True)

    def correlation_filter(self, data, threshold=0.8):
        """
        Drop the later feature of every highly correlated pair.

        Only the strict upper triangle of the absolute correlation matrix is
        inspected, so for a correlated pair the column that appears first in
        `data` is kept and the later one is dropped.

        Non-numeric columns take no part in the correlation and are always
        kept; this avoids the error `DataFrame.corr` raises on such columns
        in recent pandas versions.

        Parameters
        ----------
        data : pd.DataFrame
            Feature matrix
        threshold : float, default 0.8
            Absolute correlation strictly above which a feature is removed

        Returns
        -------
        list of str
            Columns of `data` to keep, in their original order
        """
        numeric = data.select_dtypes(include=['number', 'bool'])
        corr_matrix = numeric.corr().abs()
        # k=1 excludes the diagonal so a column is never compared to itself.
        upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
        upper = corr_matrix.where(upper_mask)
        # NaN entries (masked cells, constant columns) compare as False.
        to_drop = set(upper.columns[(upper > threshold).any(axis=0)])
        return [col for col in data.columns if col not in to_drop]

chi2_selection

chi2_selection(data, feature_cols, target_col)

Select features using chi-squared test.

参数:

| 名称 | 类型 | 描述 | 默认 |
| --- | --- | --- | --- |
| data | DataFrame | Input data | 必需 |
| feature_cols | list of str | Feature column names to evaluate | 必需 |
| target_col | str | Target variable column name | 必需 |

返回:

类型 描述
DataFrame

Results with columns ['feature', 'chi2', 'p_value', 'selected']

源代码位于: Modeling_Tool/Model/LRM_Tool.py
def chi2_selection(self, data, feature_cols, target_col):
    """
    Rank features with a chi-squared test against the target.

    Missing feature values are replaced with 0 and every feature is
    min-max scaled before the test. The result table and the list of
    selected features are also cached on the instance.

    Parameters
    ----------
    data : pd.DataFrame
        Input data holding both the features and the target
    feature_cols : list of str
        Feature column names to evaluate
    target_col : str
        Target variable column name

    Returns
    -------
    pd.DataFrame
        Columns ['feature', 'chi2', 'p_value', 'selected'], sorted by
        'chi2' in descending order
    """
    from sklearn.feature_selection import chi2
    from sklearn.preprocessing import MinMaxScaler

    features = data[feature_cols].fillna(0)
    target = data[target_col]

    scaled = MinMaxScaler().fit_transform(features)
    statistics, p_values = chi2(scaled, target)

    table = pd.DataFrame({
        'feature': feature_cols,
        'chi2': statistics,
        'p_value': p_values,
        'selected': p_values < self.significance_level,
    })
    table = table.sort_values('chi2', ascending=False).reset_index(drop=True)

    self.chi2_results_ = table
    # Selected features keep the chi2-descending order of the table.
    self.selected_features_ = table.loc[table['selected'], 'feature'].tolist()
    return table

compute_vif

compute_vif(data)

Compute Variance Inflation Factor (VIF) for multicollinearity detection.

参数:

名称 类型 描述 默认
data DataFrame

Feature matrix (should not include target variable)

必需

返回:

类型 描述
DataFrame

DataFrame with columns ['feature', 'VIF'] sorted by VIF descending

源代码位于: Modeling_Tool/Model/LRM_Tool.py
def compute_vif(self, data):
    """
    Compute the Variance Inflation Factor (VIF) of every column.

    Missing values are replaced with 0 before the computation.

    Parameters
    ----------
    data : pd.DataFrame
        Feature matrix (should not include target variable)

    Returns
    -------
    pd.DataFrame
        Columns ['feature', 'VIF'], sorted by VIF in descending order
    """
    from statsmodels.stats.outliers_influence import variance_inflation_factor

    # NOTE(review): the matrix is passed as-is, i.e. no intercept column is
    # added here -- confirm whether callers are expected to supply one.
    matrix = data.fillna(0).values
    scores = [
        variance_inflation_factor(matrix, idx)
        for idx in range(matrix.shape[1])
    ]
    table = pd.DataFrame({'feature': data.columns, 'VIF': scores})
    return table.sort_values('VIF', ascending=False).reset_index(drop=True)

correlation_filter

correlation_filter(data, threshold=0.8)

Remove highly correlated features.

参数:

| 名称 | 类型 | 描述 | 默认 |
| --- | --- | --- | --- |
| data | DataFrame | Feature matrix | 必需 |
| threshold | float | Correlation threshold above which features are removed | 0.8 |

返回:

类型 描述
list of str

List of features to keep (low correlation subset)

源代码位于: Modeling_Tool/Model/LRM_Tool.py
def correlation_filter(self, data, threshold=0.8):
    """
    Drop the later feature of every highly correlated pair.

    Only the strict upper triangle of the absolute correlation matrix is
    inspected, so for a correlated pair the column that appears first in
    `data` is kept and the later one is dropped.

    Non-numeric columns take no part in the correlation and are always
    kept; this avoids the error `DataFrame.corr` raises on such columns in
    recent pandas versions.

    Parameters
    ----------
    data : pd.DataFrame
        Feature matrix
    threshold : float, default 0.8
        Absolute correlation strictly above which a feature is removed

    Returns
    -------
    list of str
        Columns of `data` to keep, in their original order
    """
    numeric = data.select_dtypes(include=['number', 'bool'])
    corr_matrix = numeric.corr().abs()
    # k=1 excludes the diagonal so a column is never compared to itself.
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper = corr_matrix.where(upper_mask)
    # NaN entries (masked cells, constant columns) compare as False.
    to_drop = set(upper.columns[(upper > threshold).any(axis=0)])
    return [col for col in data.columns if col not in to_drop]

LRMaster

Logistic Regression Master Class.

A unified wrapper for logistic regression modeling that encapsulates:

- Model training and prediction
- Variable importance analysis
- Statistical summary generation
- Stepwise variable selection
- Holdout-based hyperparameter grid search
- AIC/BIC calculation
- Optional feature standardization (off by default)

参数:

名称 类型 描述 默认
params dict

Parameters for sklearn LogisticRegression, e.g., {'C': 1.0, 'solver': 'lbfgs'}

None

属性:

名称 类型 描述
params dict

Model parameters

model LogisticRegression

Trained model (None until fit() is called)

varlist list

List of feature names

tgt_name str

Target variable name

standardize bool

Whether feature standardization is enabled

standardizer sklearn-like scaler or None

Fitted scaler (None until fit() runs with standardize=True)

best_params_ dict or None

Best hyperparameters from grid_search_params (None until it runs)

search_results_ DataFrame or None

Full grid_search_params results table (None until it runs)

示例:

>>> lr = LRMaster(params={'C': 1.0, 'solver': 'lbfgs'})
>>> lr.fit(train_df, ['age', 'income'], 'target')
>>> predictions = lr.predict(test_df)
>>> importance = lr.get_variable_importance()
>>> # With standardization (defaults to StandardScaler)
>>> lr = LRMaster(params={'C': 1.0}, standardize=True)
>>> lr.fit(train_df, ['age', 'income'], 'target')
>>> proba = lr.predict_proba(test_df)  # test_df is scaled with the fitted scaler
源代码位于: Modeling_Tool/Model/LRM_Tool.py
 373
 374
 375
 376
 377
 378
 379
 380
 381
 382
 383
 384
 385
 386
 387
 388
 389
 390
 391
 392
 393
 394
 395
 396
 397
 398
 399
 400
 401
 402
 403
 404
 405
 406
 407
 408
 409
 410
 411
 412
 413
 414
 415
 416
 417
 418
 419
 420
 421
 422
 423
 424
 425
 426
 427
 428
 429
 430
 431
 432
 433
 434
 435
 436
 437
 438
 439
 440
 441
 442
 443
 444
 445
 446
 447
 448
 449
 450
 451
 452
 453
 454
 455
 456
 457
 458
 459
 460
 461
 462
 463
 464
 465
 466
 467
 468
 469
 470
 471
 472
 473
 474
 475
 476
 477
 478
 479
 480
 481
 482
 483
 484
 485
 486
 487
 488
 489
 490
 491
 492
 493
 494
 495
 496
 497
 498
 499
 500
 501
 502
 503
 504
 505
 506
 507
 508
 509
 510
 511
 512
 513
 514
 515
 516
 517
 518
 519
 520
 521
 522
 523
 524
 525
 526
 527
 528
 529
 530
 531
 532
 533
 534
 535
 536
 537
 538
 539
 540
 541
 542
 543
 544
 545
 546
 547
 548
 549
 550
 551
 552
 553
 554
 555
 556
 557
 558
 559
 560
 561
 562
 563
 564
 565
 566
 567
 568
 569
 570
 571
 572
 573
 574
 575
 576
 577
 578
 579
 580
 581
 582
 583
 584
 585
 586
 587
 588
 589
 590
 591
 592
 593
 594
 595
 596
 597
 598
 599
 600
 601
 602
 603
 604
 605
 606
 607
 608
 609
 610
 611
 612
 613
 614
 615
 616
 617
 618
 619
 620
 621
 622
 623
 624
 625
 626
 627
 628
 629
 630
 631
 632
 633
 634
 635
 636
 637
 638
 639
 640
 641
 642
 643
 644
 645
 646
 647
 648
 649
 650
 651
 652
 653
 654
 655
 656
 657
 658
 659
 660
 661
 662
 663
 664
 665
 666
 667
 668
 669
 670
 671
 672
 673
 674
 675
 676
 677
 678
 679
 680
 681
 682
 683
 684
 685
 686
 687
 688
 689
 690
 691
 692
 693
 694
 695
 696
 697
 698
 699
 700
 701
 702
 703
 704
 705
 706
 707
 708
 709
 710
 711
 712
 713
 714
 715
 716
 717
 718
 719
 720
 721
 722
 723
 724
 725
 726
 727
 728
 729
 730
 731
 732
 733
 734
 735
 736
 737
 738
 739
 740
 741
 742
 743
 744
 745
 746
 747
 748
 749
 750
 751
 752
 753
 754
 755
 756
 757
 758
 759
 760
 761
 762
 763
 764
 765
 766
 767
 768
 769
 770
 771
 772
 773
 774
 775
 776
 777
 778
 779
 780
 781
 782
 783
 784
 785
 786
 787
 788
 789
 790
 791
 792
 793
 794
 795
 796
 797
 798
 799
 800
 801
 802
 803
 804
 805
 806
 807
 808
 809
 810
 811
 812
 813
 814
 815
 816
 817
 818
 819
 820
 821
 822
 823
 824
 825
 826
 827
 828
 829
 830
 831
 832
 833
 834
 835
 836
 837
 838
 839
 840
 841
 842
 843
 844
 845
 846
 847
 848
 849
 850
 851
 852
 853
 854
 855
 856
 857
 858
 859
 860
 861
 862
 863
 864
 865
 866
 867
 868
 869
 870
 871
 872
 873
 874
 875
 876
 877
 878
 879
 880
 881
 882
 883
 884
 885
 886
 887
 888
 889
 890
 891
 892
 893
 894
 895
 896
 897
 898
 899
 900
 901
 902
 903
 904
 905
 906
 907
 908
 909
 910
 911
 912
 913
 914
 915
 916
 917
 918
 919
 920
 921
 922
 923
 924
 925
 926
 927
 928
 929
 930
 931
 932
 933
 934
 935
 936
 937
 938
 939
 940
 941
 942
 943
 944
 945
 946
 947
 948
 949
 950
 951
 952
 953
 954
 955
 956
 957
 958
 959
 960
 961
 962
 963
 964
 965
 966
 967
 968
 969
 970
 971
 972
 973
 974
 975
 976
 977
 978
 979
 980
 981
 982
 983
 984
 985
 986
 987
 988
 989
 990
 991
 992
 993
 994
 995
 996
 997
 998
 999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
class LRMaster:
    """
    Logistic Regression Master Class.

    A unified wrapper for logistic regression modeling that encapsulates:
    - Model training and prediction
    - Variable importance analysis
    - Statistical summary generation
    - Stepwise variable selection
    - Holdout-based hyperparameter grid search
    - AIC/BIC calculation
    - Optional feature standardization (off by default)

    Parameters
    ----------
    params : dict, optional
        Parameters for sklearn LogisticRegression, e.g., {'C': 1.0, 'solver': 'lbfgs'}

    Attributes
    ----------
    params : dict
        Model parameters
    model : sklearn.linear_model.LogisticRegression
        Trained model (None until fit() is called)
    varlist : list
        List of feature names
    tgt_name : str
        Target variable name
    standardize : bool
        Whether feature standardization is enabled
    standardizer : sklearn-like scaler or None
        Fitted scaler (None until fit() runs with standardize=True)
    best_params_ : dict or None
        Best hyperparameters from grid_search_params (None until it runs)
    search_results_ : pandas.DataFrame or None
        Full grid_search_params results table (None until it runs)

    Examples
    --------
    >>> lr = LRMaster(params={'C': 1.0, 'solver': 'lbfgs'})
    >>> lr.fit(train_df, ['age', 'income'], 'target')
    >>> predictions = lr.predict(test_df)
    >>> importance = lr.get_variable_importance()

    >>> # With standardization (defaults to StandardScaler)
    >>> lr = LRMaster(params={'C': 1.0}, standardize=True)
    >>> lr.fit(train_df, ['age', 'income'], 'target')
    >>> proba = lr.predict_proba(test_df)  # test_df is scaled with the fitted scaler
    """

    def __init__(self, params=None, model=None, varlist=None, tgt_name=None,
                 standardize=False, scaler=None):
        """
        Set up the wrapper, optionally around an already fitted model.

        Parameters
        ----------
        params : dict, optional
            LogisticRegression parameters; passed through
            `_sanitize_lr_params` before being stored
        model : sklearn-like LogisticRegression object, optional
            Existing fitted LR model to wrap directly instead of training one
        varlist : list, optional
            Feature names used by the existing model. Required when the model
            does not expose `feature_names_in_`.
        tgt_name : str, optional
            Target variable name. Useful when wrapping an existing fitted
            model and later calling summary/evaluation methods.
        standardize : bool, default False
            If True, a scaler is fitted on the training features during
            `fit` / `stepwise_selection` and re-applied at every prediction /
            evaluation entry point. The default keeps the features unscaled.
        scaler : sklearn-like transformer, optional
            Unfitted scaler prototype used when `standardize=True` (e.g.
            `MinMaxScaler()`). It is cloned before fitting, so the instance
            passed in is never mutated. `StandardScaler` is used when omitted.
        """
        # Model configuration and the (possibly pre-fitted) estimators.
        self.params = _sanitize_lr_params(params)
        self.model = model
        self.calibrated_model = None

        # Column bookkeeping plus the reference training frame.
        self.varlist, self.tgt_name = varlist, tgt_name
        self._data = None

        # Standardization state: the unfitted prototype is kept separately
        # from the fitted scaler, which stays None until fit()/stepwise runs
        # with standardize=True.
        self.standardize = standardize
        self._scaler_proto = scaler
        self.standardizer = None

        # Filled in by grid_search_params(): best param dict + results table.
        self.best_params_ = self.search_results_ = None

    def _make_scaler(self):
        """
        Build a new, unfitted scaler for standardization.

        Without a user-supplied prototype this is a plain `StandardScaler`;
        otherwise the prototype is cloned so the caller's object is never
        fitted or otherwise modified.
        """
        from sklearn.preprocessing import StandardScaler
        if self._scaler_proto is None:
            return StandardScaler()
        from sklearn.base import clone as _sk_clone
        return _sk_clone(self._scaler_proto)

    def _fit_standardizer(self, x):
        """
        Fit a scaler on `x` and store it as `self.standardizer`.

        Returns the standardized `x` (a DataFrame when `x` is one). When
        `self.standardize` is False this is a no-op that clears any scaler and
        returns `x` unchanged.
        """
        if not self.standardize:
            self.standardizer = None
            return x
        scaler = self._make_scaler()
        scaler.fit(x)
        self.standardizer = scaler
        return self._apply_standardizer(x)

    def _apply_standardizer(self, x):
        """
        Apply the fitted standardizer to `x`, preserving DataFrame layout.

        No-op when no fitted standardizer is present (e.g. `standardize=False`
        or when wrapping an externally fitted model), so existing behavior is
        unchanged unless standardization was explicitly enabled and fitted.
        """
        if self.standardizer is None:
            return x
        values = self.standardizer.transform(x)
        if hasattr(x, 'columns'):
            return pd.DataFrame(values, columns=x.columns, index=x.index)
        return values

    def set_data(self, data):
        """
        Store reference data for later use (e.g., calibration).

        Parameters
        ----------
        data : pd.DataFrame
            Training data to store

        Returns
        -------
        self
        """
        self._data = data
        return self

    def fit(self, data, varlist, tgt_name, val_data=None, val_varlist=None, val_tgt_name=None, weight_col=None):
        """
        Fit the logistic regression model on `data`.

        The feature list, target name and training frame are remembered on
        the instance. With `standardize=True` a scaler is fitted on the
        training features, stored as `self.standardizer`, and the model is
        trained on the scaled features; the same scaler is applied to the
        validation features and later at prediction / evaluation time.

        Parameters
        ----------
        data : pd.DataFrame
            Training dataset containing features and target
        varlist : list of str
            Feature column names to use for training
        tgt_name : str
            Target variable column name
        val_data : pd.DataFrame, optional
            Validation dataset. Its features / target are forwarded to
            `lr_model` only when `val_varlist` / `val_tgt_name` are given too.
        val_varlist : list of str, optional
            Validation feature column names
        val_tgt_name : str, optional
            Validation target variable column name
        weight_col : str, optional
            Column in ``data`` holding per-sample training weights, resolved
            through `resolve_sample_weight`

        Returns
        -------
        self
        """
        self.varlist, self.tgt_name, self._data = varlist, tgt_name, data

        # _fit_standardizer handles both modes: it fits and applies a scaler
        # when standardization is on, and clears any stale scaler otherwise.
        features = self._fit_standardizer(data[varlist])

        has_val = val_data is not None
        val_features = val_data[val_varlist] if has_val and val_varlist is not None else None
        val_target = val_data[val_tgt_name] if has_val and val_tgt_name is not None else None
        if val_features is not None:
            # Validation features must live in the same space as the training ones.
            val_features = self._apply_standardizer(val_features)

        weights = resolve_sample_weight(data=data, weight_col=weight_col, expected_len=len(data))
        self.model = lr_model(
            features, data[tgt_name], val_features, val_target, self.params,
            sample_weight=weights,
        )
        return self

    def calibrate_model(self, model=None, train_df=None, method='sigmoid', cv=5, weight_col=None, sample_weight=None):
        """Model calibration with optional sample weights."""
        from sklearn.calibration import CalibratedClassifierCV
        from sklearn.base import clone

        if train_df is None:
            train_df = self._data

        if model is None:
            model = self.model

        if hasattr(model, "feature_names_in_"):
            varlist = model.feature_names_in_.tolist()
        elif self.varlist is not None:
            varlist = self.varlist
        else:
            raise ValueError(
                "Cannot infer feature list from model. Please provide `varlist` when initializing LRMaster."
            )

        if hasattr(model, "get_params") and model.get_params().get("multi_class") == "deprecated":
            if cv == "prefit":
                model.set_params(multi_class="auto")
            else:
                model = clone(model)
                model.set_params(multi_class="auto")

        if cv == "prefit" and not hasattr(model, "classes_"):
            raise ValueError(
                "cv='prefit' requires a fitted model with `classes_`. "
                "Please pass a fitted LR model object or use cv=5 to refit during calibration."
            )

        # Standardize calibration features with the fitted scaler so the
        # calibrated model operates in the same feature space as self.model.
        cal_x = self._apply_standardizer(train_df[varlist])

        # sklearn 1.6 deprecated cv="prefit" in favour of wrapping the fitted
        # estimator in FrozenEstimator, and 1.8 removed "prefit" entirely. Use
        # FrozenEstimator when available, falling back to cv="prefit" on <1.6.
        estimator = model
        calib_kwargs = {"method": method}
        if cv == "prefit":
            try:
                from sklearn.frozen import FrozenEstimator
                estimator = FrozenEstimator(model)
            except ImportError:
                calib_kwargs["cv"] = "prefit"
        else:
            calib_kwargs["cv"] = cv

        # sklearn 1.2+ renamed base_estimator -> estimator; support both
        try:
            calibrated_model = CalibratedClassifierCV(estimator=estimator, **calib_kwargs)
        except TypeError:
            calibrated_model = CalibratedClassifierCV(base_estimator=estimator, **calib_kwargs)
        fit_weight = resolve_sample_weight(
            data=train_df,
            weight_col=weight_col,
            sample_weight=sample_weight,
            expected_len=len(train_df),
        )
        calibrated_model.fit(cal_x, train_df[self.tgt_name], sample_weight=fit_weight)

        self.calibrated_model = calibrated_model

        return self

    def eval_calibrated_outcome(self, evalset, plot=False, weight_col=None, sample_weight=None):
        """
        Compare raw and calibrated probabilities on a holdout set.

        Logs the Brier score of both probability sets and computes 10-bin
        reliability curves for each; the curves are drawn only when `plot`
        is True.

        Parameters
        ----------
        evalset : pd.DataFrame
            Holdout data containing the features and `self.tgt_name`
        plot : bool, default False
            If True, show the reliability curves with matplotlib
        weight_col : str, optional
            Column in `evalset` holding per-sample weights
        sample_weight : array-like, optional
            Explicit per-sample weights; resolved together with `weight_col`
            by `resolve_sample_weight`

        Returns
        -------
        None
        """
        from sklearn.calibration import calibration_curve
        from sklearn.metrics import brier_score_loss

        y_val = evalset[self.tgt_name]
        eval_weight = resolve_sample_weight(
            data=evalset,
            weight_col=weight_col,
            sample_weight=sample_weight,
            expected_len=len(evalset),
        )

        # Second-column probability from the raw and the calibrated model.
        prob_raw = self.predict_proba(evalset)[:, 1]
        prob_cal = self.predict_proba(evalset, calibrated_model=True)[:, 1]

        # 1. Brier score (lower is better).
        raw_brier = brier_score_loss(y_val, prob_raw, sample_weight=eval_weight)
        logger.info("Raw Brier: %.6f", raw_brier)
        cal_brier = brier_score_loss(y_val, prob_cal, sample_weight=eval_weight)
        logger.info("Cal Brier: %.6f", cal_brier)

        # 2. Reliability curves, raw first and calibrated second.
        def _reliability_curves(**extra):
            return [
                calibration_curve(y_val, prob, n_bins=10, **extra)
                for prob in (prob_raw, prob_cal)
            ]

        curve_kwargs = {} if eval_weight is None else {"sample_weight": eval_weight}
        try:
            (frac_raw, mean_raw), (frac_cal, mean_cal) = _reliability_curves(**curve_kwargs)
        except TypeError:
            # Presumably a calibration_curve that does not accept
            # sample_weight; recompute both curves unweighted.
            (frac_raw, mean_raw), (frac_cal, mean_cal) = _reliability_curves()

        if not plot:
            return

        import matplotlib.pyplot as plt
        # NOTE(review): the calibrated curve is labelled 'Platt' regardless
        # of the calibration method that was actually used.
        plt.plot(mean_raw, frac_raw, 's-', label='Raw')
        plt.plot(mean_cal, frac_cal, 'o-', label='Platt')
        plt.plot([0,1], [0,1], 'k--', label='Perfect')
        plt.xlabel('Mean Predicted Probability')
        plt.ylabel('Fraction of Positives')
        plt.legend()
        plt.show()
    def predict(self, data, varlist=None, calibrated_model = False):
        """
        Predict using the trained model.

        When standardization is enabled, the input features are scaled with the
        scaler fitted during `fit` before being passed to the model.

        Parameters
        ----------
        data : pandas.DataFrame
            Input data for prediction
        varlist : list, optional
            Feature names (uses training features if None)

        Returns
        -------
        numpy.ndarray
            Predicted class labels
        """
        if varlist is None:
            varlist = self.varlist

        x = self._apply_standardizer(data[varlist])

        if calibrated_model:
            _patch_calibrated_model(self.calibrated_model)
            return self.calibrated_model.predict(x)

        return self.model.predict(x)

    def predict_proba(self, data, varlist=None, calibrated_model = False):
        """
        Predict class probabilities.

        When standardization is enabled, the input features are scaled with the
        scaler fitted during `fit` before being passed to the model.

        Parameters
        ----------
        data : pandas.DataFrame
            Input data for prediction
        varlist : list, optional
            Feature names (uses training features if None)

        Returns
        -------
        numpy.ndarray
            Array of shape (n_samples, 2) with class probabilities
        """
        if varlist is None:
            varlist = self.varlist

        x = self._apply_standardizer(data[varlist])

        if calibrated_model:
            _patch_calibrated_model(self.calibrated_model)
            return self.calibrated_model.predict_proba(x)

        return self.model.predict_proba(x)

    def get_variable_importance(self):
        """
        Get variable importance (coefficients) from the model.

        Thin wrapper that delegates to `lr_varimp` with `self.model`; the
        model must therefore be available (trained or supplied at
        construction) before calling this.

        Returns
        -------
        pandas.DataFrame
            DataFrame with columns ['varlist', 'coef', 'importance'] sorted by
            importance in descending order (layout is produced by `lr_varimp`,
            which is defined elsewhere -- verify there)

        Notes
        -----
        When standardization is enabled the coefficients are expressed in the
        standardized feature space (i.e. they are directly comparable in
        magnitude across features).
        """
        return lr_varimp(self.model)

    def get_statsmodel_summary(self, data=None, varlist=None, tgt_name=None):
        """
        Build a statsmodels-style summary for the trained LR model.

        Any argument left as None falls back to the value stored on the
        instance. The features are passed through the fitted standardizer
        (a no-op when none is stored) before being handed to
        `get_lr_statsmodel_summary`.

        Parameters
        ----------
        data : pd.DataFrame, optional
            Data for computing the summary (stored training data if None)
        varlist : list of str, optional
            Feature names (stored feature list if None)
        tgt_name : str, optional
            Target variable name (stored target name if None)

        Returns
        -------
        pandas.DataFrame
            Summary table with coefficients, standard errors, z-scores and
            p-values

        Notes
        -----
        When standardization is enabled the summary is computed on the
        standardized feature space, consistent with how the model was trained.
        """
        frame = self._data if data is None else data
        columns = self.varlist if varlist is None else varlist
        target = self.tgt_name if tgt_name is None else tgt_name

        design = self._apply_standardizer(frame[columns])
        return get_lr_statsmodel_summary(
            self.model, design, frame[target], feature_names=columns
        )

    def get_aic(self, data=None, varlist=None, tgt_name=None, weight_col=None):
        """
        Compute the AIC of the trained model via `compute_aic`.

        Any of `data`, `varlist` and `tgt_name` left as None falls back to
        the value stored on the instance. The features are passed through
        the fitted standardizer (a no-op when none is stored).

        Parameters
        ----------
        data : pd.DataFrame, optional
            Data to score (stored training data if None)
        varlist : list of str, optional
            Feature names (stored feature list if None)
        tgt_name : str, optional
            Target variable name (stored target name if None)
        weight_col : str, optional
            Column in `data` holding per-sample weights, resolved through
            `resolve_sample_weight`

        Returns
        -------
        float
        """
        frame = self._data if data is None else data
        columns = self.varlist if varlist is None else varlist
        target = self.tgt_name if tgt_name is None else tgt_name

        features = self._apply_standardizer(frame[columns])
        labels = frame[target]
        weights = resolve_sample_weight(data=frame, weight_col=weight_col, expected_len=len(frame))
        return compute_aic(self.model, features, labels, sample_weight=weights)

    def get_bic(self, data=None, varlist=None, tgt_name=None, weight_col=None):
        """
        Compute the BIC of the trained model via `compute_bic`.

        Any of `data`, `varlist` and `tgt_name` left as None falls back to
        the value stored on the instance. The features are passed through
        the fitted standardizer (a no-op when none is stored).

        Parameters
        ----------
        data : pd.DataFrame, optional
            Data to score (stored training data if None)
        varlist : list of str, optional
            Feature names (stored feature list if None)
        tgt_name : str, optional
            Target variable name (stored target name if None)
        weight_col : str, optional
            Column in `data` holding per-sample weights, resolved through
            `resolve_sample_weight`

        Returns
        -------
        float
        """
        frame = self._data if data is None else data
        columns = self.varlist if varlist is None else varlist
        target = self.tgt_name if tgt_name is None else tgt_name

        features = self._apply_standardizer(frame[columns])
        labels = frame[target]
        weights = resolve_sample_weight(data=frame, weight_col=weight_col, expected_len=len(frame))
        return compute_bic(self.model, features, labels, sample_weight=weights)

    def stepwise_selection(
        self,
        data,
        varlist,
        tgt_name,
        criterion='aic',
        direction='both',
        max_iter=100,
        verbose=True,
        weight_col=None,
    ):
        """
        Perform stepwise variable selection.

        Iteratively adds or removes features while the chosen information
        criterion (lower is better) strictly improves. ``'backward'`` and
        ``'both'`` start from the full ``varlist``; ``'forward'`` starts from
        an empty feature set. The loop stops as soon as an iteration brings
        no improvement, or after ``max_iter`` iterations.

        When `standardize=True`, all interim fits and the final model are
        trained on standardized features, and the fitted scaler for the selected
        columns is stored on the instance for later prediction.

        Parameters
        ----------
        data : pd.DataFrame
            Training data
        varlist : list of str
            Initial feature list
        tgt_name : str
            Target variable name
        criterion : str, default 'aic'
            Selection criterion, 'aic' or 'bic'
        direction : str, default 'both'
            Direction of stepwise selection: 'forward', 'backward', or 'both'
        max_iter : int, default 100
            Maximum number of iterations
        verbose : bool, default True
            Whether to log progress
        weight_col : str, optional
            Forwarded with ``data`` to ``resolve_sample_weight``; the resulting
            weights are used for every interim fit, every criterion
            evaluation and the final fit.

        Returns
        -------
        list of str
            Selected feature list

        Raises
        ------
        ValueError
            If ``criterion`` or ``direction`` is not one of the documented
            values (previously an unknown criterion silently meant BIC and an
            unknown direction silently performed no selection).

        Notes
        -----
        Side effects: sets ``self.varlist``, ``self.tgt_name``, ``self._data``,
        ``self.standardizer`` and ``self.model``.
        """
        if criterion not in ('aic', 'bic'):
            raise ValueError("criterion must be 'aic' or 'bic', got {0!r}".format(criterion))
        if direction not in ('forward', 'backward', 'both'):
            raise ValueError(
                "direction must be 'forward', 'backward' or 'both', got {0!r}".format(direction)
            )

        criterion_fn = compute_aic if criterion == 'aic' else compute_bic
        # Weights and target do not change between trials: resolve them once
        # instead of on every criterion evaluation.
        sample_weight = resolve_sample_weight(data=data, weight_col=weight_col, expected_len=len(data))
        target = data[tgt_name]

        # When standardizing, operate on a once-standardized feature frame.
        # Column-wise scalers (StandardScaler / MinMaxScaler) make slicing a
        # subset of columns equivalent to standardizing that subset.
        if self.standardize:
            interim_scaler = self._make_scaler()
            interim_scaler.fit(data[varlist])
            work = pd.DataFrame(
                interim_scaler.transform(data[varlist]),
                columns=list(varlist), index=data.index,
            )
        else:
            work = data

        def _criterion_for(trial_vars):
            """Fit a model on ``trial_vars`` and return its criterion value."""
            trial_x = work[trial_vars]
            model = lr_model(trial_x, target, None, None, self.params, sample_weight=sample_weight)
            return criterion_fn(model, trial_x, target, sample_weight=sample_weight)

        def _score_trials(trials):
            """Score ``(var, trial_vars)`` pairs, skipping trials that fail to fit."""
            scores = {}
            for var, trial_vars in trials:
                try:
                    scores[var] = _criterion_for(trial_vars)
                except Exception as exc:
                    # Best-effort: a candidate that cannot be fitted is left
                    # out of this step, but the reason is no longer hidden.
                    logger.debug("Stepwise trial for '%s' skipped: %s", var, exc)
            return scores

        current_vars = list(varlist) if direction != 'forward' else []
        remaining_vars = list(varlist) if direction == 'forward' else []

        # An empty starting set has no model, so any fitted trial beats it.
        best_score = _criterion_for(current_vars) if current_vars else float('inf')

        for iteration in range(max_iter):
            improved = False

            # Forward step: add the single variable that lowers the criterion most.
            if direction in ('forward', 'both') and remaining_vars:
                scores = _score_trials(
                    (var, current_vars + [var]) for var in remaining_vars
                )
                if scores:
                    best_var = min(scores, key=scores.get)
                    if scores[best_var] < best_score:
                        current_vars.append(best_var)
                        remaining_vars.remove(best_var)
                        best_score = scores[best_var]
                        improved = True
                        if verbose:
                            logger.info(f"[Step {iteration+1}] ADD '{best_var}', {criterion.upper()}={best_score:.4f}")

            # Backward step: drop the variable whose removal lowers the
            # criterion most; at least one variable is always kept.
            if direction in ('backward', 'both') and len(current_vars) > 1:
                scores = _score_trials(
                    (var, [v for v in current_vars if v != var]) for var in current_vars
                )
                if scores:
                    worst_var = min(scores, key=scores.get)
                    if scores[worst_var] < best_score:
                        current_vars.remove(worst_var)
                        if direction == 'both':
                            # A removed variable may be re-added by a later forward step.
                            remaining_vars.append(worst_var)
                        best_score = scores[worst_var]
                        improved = True
                        if verbose:
                            logger.info(f"[Step {iteration+1}] REMOVE '{worst_var}', {criterion.upper()}={best_score:.4f}")

            if not improved:
                break

        if verbose:
            logger.info(f"Stepwise selection complete. Selected {len(current_vars)} features.")

        self.varlist = current_vars
        self.tgt_name = tgt_name
        self._data = data

        # Refit the scaler on the selected columns only so that prediction
        # uses a scaler whose feature set matches the final model.
        if self.standardize:
            self.standardizer = self._make_scaler()
            self.standardizer.fit(data[current_vars])
            final_x = self._apply_standardizer(data[current_vars])
        else:
            self.standardizer = None
            final_x = data[current_vars]

        self.model = lr_model(final_x, target, None, None, self.params, sample_weight=sample_weight)
        return current_vars

    def grid_search_params(self, data, varlist, tgt_name, eval_sets, param_grid,
                           objective='oot_gap_penalized', primary_set=None,
                           gap_ref_sets=None, metric='auc', refit=True, verbose=True,
                           weight_col=None, eval_weight_col=None):
        """
        Grid-search LogisticRegression hyperparameters over a holdout-based objective.

        For every combination in ``param_grid`` (Cartesian product), a candidate model is
        trained on ``data`` and scored by AUC on each dataset in ``eval_sets``. The best
        combination is chosen by ``objective`` (default rewards a high primary-set AUC while
        penalizing the train/holdout AUC gap, i.e. overfitting). This is a **holdout** search
        (not k-fold CV), intended for the typical INS/OOS/OOT credit-scoring setup.

        When ``standardize=True`` on this instance, every candidate inherits the same
        standardization config (each candidate fits its own scaler on ``data``), so the
        search runs in the same feature space the final (optionally refit) model uses.

        Parameters
        ----------
        data : pandas.DataFrame
            Training dataset (e.g. the in-sample set used for fitting).
        varlist : list
            Feature column names.
        tgt_name : str
            Target column name.
        eval_sets : dict of {str: pandas.DataFrame}
            Ordered mapping of datasets to score by AUC, e.g.
            ``{'ins': ins_df, 'oos': oos_df, 'oot': oot_df}``.
        param_grid : dict of {str: iterable}
            Hyperparameter search space, e.g. ``{'C': np.logspace(-3, 2, 31)}``.
            Multiple keys are combined as a Cartesian product.
        objective : str or callable, default 'oot_gap_penalized'
            How to score each candidate from its per-set AUCs:

            - ``'oot_gap_penalized'`` : ``AUC[primary] - |mean(AUC[gap_refs]) - AUC[primary]|``
              (maximize the primary set while penalizing the overfitting gap).
            - ``'max_primary'`` : ``AUC[primary]``.
            - callable : ``f(auc_dict) -> float`` where ``auc_dict`` maps set name to AUC.
        primary_set : str, optional
            Key of ``eval_sets`` whose AUC to maximize. Defaults to the last key.
        gap_ref_sets : list of str, optional
            Set names whose mean AUC forms the gap reference. Defaults to all sets except
            ``primary_set``. Only used by ``'oot_gap_penalized'``.
        metric : str, default 'auc'
            Evaluation metric. Currently only ``'auc'`` is supported.
        refit : bool, default True
            If True, refit ``self`` on ``data`` with the best parameters after searching.
        verbose : bool, default True
            Print progress / best result.
        weight_col : str, optional
            Passed to ``fit`` for every candidate (and for the final refit) as the
            training weight column of ``data``.
        eval_weight_col : str, optional
            Forwarded with each eval set to ``resolve_sample_weight``; the result is
            passed to ``roc_auc_score`` as ``sample_weight``.

        Returns
        -------
        pandas.DataFrame
            Search results sorted by ``score`` descending, with columns: the param name(s)
            + ``AUC_<name>`` per eval set + ``gap`` (gap objective only) + ``score``.
            Candidates with equal (rounded) score keep their grid order.

        Raises
        ------
        ValueError
            Unsupported ``metric``, empty ``eval_sets``, unknown ``primary_set``,
            unknown ``objective`` string, or a ``param_grid`` that yields no
            combinations.
        KeyError
            Missing feature/target columns in ``data`` or an eval set, or a
            ``gap_ref_sets`` name that is not a key of ``eval_sets``.

        Side Effects
        ------------
        Sets ``self.best_params_`` (dict) and ``self.search_results_`` (the returned table),
        and merges the best combo into ``self.params``; if ``refit=True``, also retrains
        ``self.model`` on ``data``.

        Examples
        --------
        >>> tuner = LRMaster(params={'C': 1.0, 'solver': 'lbfgs'})
        >>> res = tuner.grid_search_params(
        ...     data=ins_fit, varlist=woe_cols, tgt_name='bad_flag',
        ...     eval_sets={'ins': ins_woe, 'oos': oos_woe, 'oot': oot_woe},
        ...     param_grid={'C': np.logspace(-3, 2, 31)},
        ...     primary_set='oot', gap_ref_sets=['ins', 'oos'], refit=False,
        ... )
        >>> best_C = tuner.best_params_['C']
        """
        import itertools
        from sklearn.metrics import roc_auc_score

        if metric != 'auc':
            raise ValueError("Only metric='auc' is currently supported.")
        if not eval_sets:
            raise ValueError("eval_sets must be a non-empty {name: DataFrame} mapping.")
        # Reject an unknown objective before any candidate is trained.
        if not callable(objective) and objective not in ('oot_gap_penalized', 'max_primary'):
            raise ValueError("Unknown objective: {0}".format(objective))

        set_names = list(eval_sets.keys())
        if primary_set is None:
            primary_set = set_names[-1]
        if primary_set not in eval_sets:
            raise ValueError("primary_set '{0}' not in eval_sets {1}".format(primary_set, set_names))
        if gap_ref_sets is None:
            gap_ref_sets = [n for n in set_names if n != primary_set]

        # Validate columns up-front for a clear error instead of a deep KeyError.
        required_cols = list(varlist) + [tgt_name]
        missing = [c for c in required_cols if c not in data.columns]
        if missing:
            raise KeyError("training data missing columns: {0}".format(missing))
        for _name, _df in eval_sets.items():
            _miss = [c for c in required_cols if c not in _df.columns]
            if _miss:
                raise KeyError("eval set '{0}' missing columns: {1}".format(_name, _miss))

        param_names = list(param_grid.keys())
        combos = list(itertools.product(*[list(param_grid[k]) for k in param_names]))
        if not combos:
            # An empty value list would otherwise surface as KeyError: 'score'.
            raise ValueError("param_grid yields no combinations; every key needs at least one value.")
        use_gap = (not callable(objective)) and objective == 'oot_gap_penalized' and len(gap_ref_sets) > 0
        if use_gap:
            unknown_refs = [n for n in gap_ref_sets if n not in eval_sets]
            if unknown_refs:
                raise KeyError("gap_ref_sets {0} not in eval_sets {1}".format(unknown_refs, set_names))

        if verbose:
            print("grid_search_params: {0} 组合 (params={1}), 训练集 {2:,} 行, eval={3}".format(
                len(combos), param_names, len(data), set_names))

        def _score(auc_dict):
            if callable(objective):
                return objective(auc_dict)
            if objective == 'max_primary':
                return auc_dict[primary_set]
            if objective == 'oot_gap_penalized':
                primary = auc_dict[primary_set]
                if gap_ref_sets:
                    ref = float(np.mean([auc_dict[n] for n in gap_ref_sets]))
                    return primary - abs(ref - primary)
                return primary
            raise ValueError("Unknown objective: {0}".format(objective))

        # Targets and evaluation weights are identical for every candidate,
        # so resolve them once per eval set rather than once per combo.
        eval_inputs = {
            name: (
                df_eval[tgt_name],
                resolve_sample_weight(data=df_eval, weight_col=eval_weight_col, expected_len=len(df_eval)),
            )
            for name, df_eval in eval_sets.items()
        }

        rows = []
        combo_dicts = []  # original (un-coerced) parameter values, parallel to ``rows``
        for combo in combos:
            combo_dict = dict(zip(param_names, combo))
            # Candidates inherit this instance's standardization config so the
            # search happens in the same feature space the final model uses.
            cand = LRMaster(
                params={**self.params, **combo_dict},
                standardize=self.standardize,
                scaler=self._scaler_proto,
            )
            cand.fit(data, varlist, tgt_name, weight_col=weight_col)

            auc_dict = {}
            for name, df_eval in eval_sets.items():
                y_eval, eval_sw = eval_inputs[name]
                proba = cand.predict_proba(df_eval, varlist)[:, 1]
                auc_dict[name] = roc_auc_score(y_eval, proba, sample_weight=eval_sw)

            row = dict(combo_dict)
            for name in set_names:
                row['AUC_{0}'.format(name)] = round(auc_dict[name], 5)
            if use_gap:
                ref = float(np.mean([auc_dict[n] for n in gap_ref_sets]))
                row['gap'] = round(ref - auc_dict[primary_set], 5)
            row['score'] = round(_score(auc_dict), 5)
            rows.append(row)
            combo_dicts.append(combo_dict)

        # Stable sort: candidates with equal score keep their grid order, so
        # the winner among ties is deterministic.
        ranked = pd.DataFrame(rows).sort_values('score', ascending=False, kind='mergesort')
        # The pre-reset index label is the position of the winner in ``rows``.
        best_combo = combo_dicts[int(ranked.index[0])]
        search_df = ranked.reset_index(drop=True)
        best_row = search_df.iloc[0]

        def _native(v):
            # Cast numpy scalars to native Python types for a clean repr /
            # JSON-serializable params; leave everything else untouched.
            return v.item() if isinstance(v, np.generic) else v

        # Take the winning values from the original combo, not from the result
        # table: selecting a DataFrame row coerces mixed numeric columns to a
        # common dtype (an int such as max_iter would come back as a float)
        # and can turn None into NaN.
        self.best_params_ = {k: _native(best_combo[k]) for k in param_names}
        self.search_results_ = search_df
        self.params = {**self.params, **self.best_params_}

        # Round float param columns for display only (does not affect best_params_).
        for k in param_names:
            if pd.api.types.is_float_dtype(search_df[k]):
                search_df[k] = search_df[k].round(5)

        if verbose:
            print("★ best: {0} | score={1:.5f} | AUC_{2}={3:.5f}".format(
                self.best_params_, best_row['score'], primary_set,
                best_row['AUC_{0}'.format(primary_set)]))

        if refit:
            self.fit(data, varlist, tgt_name, weight_col=weight_col)

        return search_df

    def clone(self):
        """
        Build a fresh LRMaster configured like this one.

        Only constructor arguments are forwarded: a shallow copy of
        ``self.params``, the ``standardize`` flag and the scaler prototype
        (``self._scaler_proto``). Fitted attributes such as ``self.model``
        are not passed to the new instance.

        Returns
        -------
        LRMaster
            New instance sharing this instance's params/standardization
            config.
        """
        params_copy = dict(self.params)
        use_standardizer = self.standardize
        scaler_proto = self._scaler_proto
        return LRMaster(params=params_copy, standardize=use_standardizer, scaler=scaler_proto)

set_data

set_data(data)

Store reference data for later use (e.g., calibration).

参数:

名称 类型 描述 默认
data DataFrame

Training data to store

必需

返回:

类型 描述
self
源代码位于: Modeling_Tool/Model/LRM_Tool.py
def set_data(self, data):
    """
    Remember ``data`` on the instance as ``self._data``.

    Other methods fall back to this stored frame when called without an
    explicit dataset (e.g. calibration).

    Parameters
    ----------
    data : pd.DataFrame
        Dataset to keep a reference to (stored as-is, not copied).

    Returns
    -------
    self
        The instance itself, so calls can be chained.
    """
    self._data = data
    return self

fit

fit(data, varlist, tgt_name, val_data=None, val_varlist=None, val_tgt_name=None, weight_col=None)

Train the logistic regression model.

When standardize=True, a scaler is fitted on the training features and stored as self.standardizer; the model is then trained on the scaled features. The same scaler is reused at prediction / evaluation time.

参数:

名称 类型 描述 默认
data DataFrame

Training dataset containing features and target

必需
varlist list of str

Feature column names to use for training

必需
tgt_name str

Target variable column name

必需
val_data DataFrame

Validation dataset (currently used for reference; not used in fitting)

None
val_varlist list of str

Validation feature column names

None
val_tgt_name str

Validation target variable column name

None
weight_col str

Column in data with per-sample training weights (non-negative). Mutually exclusive with passing sample_weight to lower-level helpers.

None

返回:

类型 描述
self
源代码位于: Modeling_Tool/Model/LRM_Tool.py
def fit(self, data, varlist, tgt_name, val_data=None, val_varlist=None, val_tgt_name=None, weight_col=None):
    """
    Fit the logistic regression model on ``data``.

    The feature list, target name and training frame are stored on the
    instance. With `standardize=True` the training features go through
    `self._fit_standardizer` and the model is trained on its output;
    otherwise `self.standardizer` is reset to None. Validation features,
    when supplied, are passed through `self._apply_standardizer`.

    Parameters
    ----------
    data : pd.DataFrame
        Training dataset containing features and target
    varlist : list of str
        Feature column names to train on
    tgt_name : str
        Target column name
    val_data : pd.DataFrame, optional
        Validation dataset; its features/target are handed to ``lr_model``
    val_varlist : list of str, optional
        Feature columns of ``val_data``
    val_tgt_name : str, optional
        Target column of ``val_data``
    weight_col : str, optional
        Forwarded with ``data`` to ``resolve_sample_weight`` to obtain the
        per-sample training weights.

    Returns
    -------
    self
    """
    self.varlist = varlist
    self.tgt_name = tgt_name
    self._data = data

    features = data[varlist]
    if not self.standardize:
        self.standardizer = None
    else:
        features = self._fit_standardizer(features)

    # Validation inputs are only built when both the frame and the
    # corresponding column spec were provided.
    has_val = val_data is not None
    val_x = val_data[val_varlist] if has_val and val_varlist is not None else None
    val_y = val_data[val_tgt_name] if has_val and val_tgt_name is not None else None
    if val_x is not None:
        val_x = self._apply_standardizer(val_x)

    weights = resolve_sample_weight(data=data, weight_col=weight_col, expected_len=len(data))
    self.model = lr_model(features, data[tgt_name], val_x, val_y, self.params, sample_weight=weights)
    return self

calibrate_model

calibrate_model(model=None, train_df=None, method='sigmoid', cv=5, weight_col=None, sample_weight=None)

Model calibration with optional sample weights.

源代码位于: Modeling_Tool/Model/LRM_Tool.py
def calibrate_model(self, model=None, train_df=None, method='sigmoid', cv=5, weight_col=None, sample_weight=None):
    """
    Fit a ``CalibratedClassifierCV`` around the model and store it.

    Parameters
    ----------
    model : estimator, optional
        Model to calibrate; defaults to ``self.model``.
    train_df : pd.DataFrame, optional
        Calibration data; defaults to the stored ``self._data``.
    method : str, default 'sigmoid'
        Passed to ``CalibratedClassifierCV`` as ``method``.
    cv : int or 'prefit', default 5
        Passed to ``CalibratedClassifierCV``; ``'prefit'`` calibrates an
        already fitted model without refitting it.
    weight_col, sample_weight : optional
        Forwarded to ``resolve_sample_weight`` to obtain the weights used
        when fitting the calibrator.

    Returns
    -------
    self
        The calibrated estimator is stored as ``self.calibrated_model``.

    Raises
    ------
    ValueError
        If the feature list cannot be determined, or ``cv='prefit'`` is used
        with a model that has no ``classes_`` attribute.
    """
    from sklearn.calibration import CalibratedClassifierCV
    from sklearn.base import clone

    train_df = self._data if train_df is None else train_df
    model = self.model if model is None else model

    # Prefer the feature names recorded on the model; fall back to the
    # feature list stored on this instance.
    model_has_names = hasattr(model, "feature_names_in_")
    if not model_has_names and self.varlist is None:
        raise ValueError(
            "Cannot infer feature list from model. Please provide `varlist` when initializing LRMaster."
        )
    varlist = model.feature_names_in_.tolist() if model_has_names else self.varlist

    use_prefit = cv == "prefit"

    # A model whose `multi_class` param is "deprecated" gets it reset to
    # "auto". A prefit model is changed in place; otherwise a clone is used
    # so the caller's model stays untouched.
    if hasattr(model, "get_params") and model.get_params().get("multi_class") == "deprecated":
        if not use_prefit:
            model = clone(model)
        model.set_params(multi_class="auto")

    if use_prefit and not hasattr(model, "classes_"):
        raise ValueError(
            "cv='prefit' requires a fitted model with `classes_`. "
            "Please pass a fitted LR model object or use cv=5 to refit during calibration."
        )

    # Calibrate in the same feature space the model was trained in.
    cal_x = self._apply_standardizer(train_df[varlist])

    # sklearn 1.6 deprecated cv="prefit" in favour of FrozenEstimator and
    # 1.8 removed it; wrap when available, else fall back to cv="prefit".
    calib_kwargs = {"method": method}
    estimator = model
    if not use_prefit:
        calib_kwargs["cv"] = cv
    else:
        try:
            from sklearn.frozen import FrozenEstimator
            estimator = FrozenEstimator(model)
        except ImportError:
            calib_kwargs["cv"] = "prefit"

    # sklearn 1.2+ renamed base_estimator -> estimator; support both.
    try:
        calibrator = CalibratedClassifierCV(estimator=estimator, **calib_kwargs)
    except TypeError:
        calibrator = CalibratedClassifierCV(base_estimator=estimator, **calib_kwargs)

    fit_weight = resolve_sample_weight(
        data=train_df,
        weight_col=weight_col,
        sample_weight=sample_weight,
        expected_len=len(train_df),
    )
    calibrator.fit(cal_x, train_df[self.tgt_name], sample_weight=fit_weight)

    self.calibrated_model = calibrator
    return self

eval_calibrated_outcome

eval_calibrated_outcome(evalset, plot=False, weight_col=None, sample_weight=None)

Evaluate calibrated vs raw probabilities on a holdout set.

源代码位于: Modeling_Tool/Model/LRM_Tool.py
def eval_calibrated_outcome(self, evalset, plot=False, weight_col=None, sample_weight=None):
    """
    Compare raw and calibrated probabilities on a holdout set.

    Logs the Brier score of both probability vectors, computes a 10-bin
    reliability curve for each and, when ``plot`` is true, draws both curves
    against the diagonal with matplotlib. Nothing is returned.

    Parameters
    ----------
    evalset : pd.DataFrame
        Holdout data containing the features and ``self.tgt_name``.
    plot : bool, default False
        Whether to show the reliability plot.
    weight_col, sample_weight : optional
        Forwarded to ``resolve_sample_weight``; the resulting weights are
        used for the Brier scores and, if supported, the reliability curves.
    """
    from sklearn.calibration import calibration_curve
    from sklearn.metrics import brier_score_loss

    y_val = evalset[self.tgt_name]
    eval_weight = resolve_sample_weight(
        data=evalset,
        weight_col=weight_col,
        sample_weight=sample_weight,
        expected_len=len(evalset),
    )

    # Second probability column from the uncalibrated model ...
    prob_raw = self.predict_proba(evalset)[:, 1]
    # ... and from the calibrated model.
    prob_cal = self.predict_proba(evalset, calibrated_model=True)[:, 1]

    # 1. Brier score (lower is better)
    raw_brier = brier_score_loss(y_val, prob_raw, sample_weight=eval_weight)
    logger.info("Raw Brier: %.6f", raw_brier)
    cal_brier = brier_score_loss(y_val, prob_cal, sample_weight=eval_weight)
    logger.info("Cal Brier: %.6f", cal_brier)

    # 2. Reliability curves
    if eval_weight is None:
        curve_kwargs = {}
    else:
        curve_kwargs = {"sample_weight": eval_weight}

    def _both_curves(**extra):
        # Raw curve first, calibrated curve second.
        return (
            calibration_curve(y_val, prob_raw, n_bins=10, **extra),
            calibration_curve(y_val, prob_cal, n_bins=10, **extra),
        )

    try:
        raw_curve, cal_curve = _both_curves(**curve_kwargs)
    except TypeError:
        # calibration_curve rejected the weights: fall back to unweighted curves.
        raw_curve, cal_curve = _both_curves()

    fraction_of_positives_raw, mean_predicted_value_raw = raw_curve
    fraction_of_positives_cal, mean_predicted_value_cal = cal_curve

    if plot:
        import matplotlib.pyplot as plt
        plt.plot(mean_predicted_value_raw, fraction_of_positives_raw, 's-', label='Raw')
        plt.plot(mean_predicted_value_cal, fraction_of_positives_cal, 'o-', label='Platt')
        plt.plot([0,1], [0,1], 'k--', label='Perfect')
        plt.xlabel('Mean Predicted Probability')
        plt.ylabel('Fraction of Positives')
        plt.legend()
        plt.show()

predict

predict(data, varlist=None, calibrated_model=False)

Predict using the trained model.

When standardization is enabled, the input features are scaled with the scaler fitted during fit before being passed to the model.

参数:

名称 类型 描述 默认
data DataFrame

Input data for prediction

必需
varlist list

Feature names (uses training features if None)

None

返回:

类型 描述
ndarray

Predicted class labels

源代码位于: Modeling_Tool/Model/LRM_Tool.py
def predict(self, data, varlist=None, calibrated_model = False):
    """
    Return predictions from the trained (or calibrated) model.

    The selected feature columns are passed through
    `self._apply_standardizer` before being handed to the estimator.

    Parameters
    ----------
    data : pandas.DataFrame
        Input data for prediction
    varlist : list, optional
        Feature names; the stored training feature list is used when None
    calibrated_model : bool, default False
        When true, `self.calibrated_model` is used instead of `self.model`
        (after being passed to `_patch_calibrated_model`).

    Returns
    -------
    numpy.ndarray
        Predicted class labels
    """
    columns = self.varlist if varlist is None else varlist
    features = self._apply_standardizer(data[columns])

    if not calibrated_model:
        return self.model.predict(features)

    _patch_calibrated_model(self.calibrated_model)
    return self.calibrated_model.predict(features)

predict_proba

predict_proba(data, varlist=None, calibrated_model=False)

Predict class probabilities.

When standardization is enabled, the input features are scaled with the scaler fitted during fit before being passed to the model.

参数:

名称 类型 描述 默认
data DataFrame

Input data for prediction

必需
varlist list

Feature names (uses training features if None)

None

返回:

类型 描述
ndarray

Array of shape (n_samples, 2) with class probabilities

源代码位于: Modeling_Tool/Model/LRM_Tool.py
def predict_proba(self, data, varlist=None, calibrated_model = False):
    """
    Return class probabilities from the trained (or calibrated) model.

    The selected feature columns are passed through
    `self._apply_standardizer` before being handed to the estimator.

    Parameters
    ----------
    data : pandas.DataFrame
        Input data for prediction
    varlist : list, optional
        Feature names; the stored training feature list is used when None
    calibrated_model : bool, default False
        When true, `self.calibrated_model` is used instead of `self.model`
        (after being passed to `_patch_calibrated_model`).

    Returns
    -------
    numpy.ndarray
        Array of shape (n_samples, 2) with class probabilities
    """
    columns = self.varlist if varlist is None else varlist
    features = self._apply_standardizer(data[columns])

    if not calibrated_model:
        return self.model.predict_proba(features)

    _patch_calibrated_model(self.calibrated_model)
    return self.calibrated_model.predict_proba(features)

get_variable_importance

get_variable_importance()

Get variable importance (coefficients) from the model.

返回:

类型 描述
DataFrame

DataFrame with columns ['varlist', 'coef', 'importance'] sorted by importance in descending order

Notes

When standardization is enabled the coefficients are expressed in the standardized feature space (i.e. they are directly comparable in magnitude across features).

源代码位于: Modeling_Tool/Model/LRM_Tool.py
def get_variable_importance(self):
    """
    Return the variable-importance table of the trained model.

    Thin wrapper that hands ``self.model`` to ``lr_varimp``.

    Returns
    -------
    pandas.DataFrame
        Result of ``lr_varimp``; documented as a frame with columns
        ['varlist', 'coef', 'importance'] sorted by importance descending.

    Notes
    -----
    With standardization enabled the model is trained on scaled features,
    so its coefficients refer to the standardized feature space and are
    comparable in magnitude across features.
    """
    fitted = self.model
    return lr_varimp(fitted)

get_statsmodel_summary

get_statsmodel_summary(data=None, varlist=None, tgt_name=None)

Generate a statsmodels-style summary for the trained LR model.

参数:

名称 类型 描述 默认
data DataFrame

Data for computing the summary (uses stored training data if None)

None
varlist list of str

Feature names (uses stored varlist if None)

None
tgt_name str

Target variable name (uses stored tgt_name if None)

None

返回:

类型 描述
DataFrame

Summary table with coefficients, standard errors, z-scores and p-values

Notes

When standardization is enabled the summary is computed on the standardized feature space, consistent with how the model was trained.

源代码位于: Modeling_Tool/Model/LRM_Tool.py
def get_statsmodel_summary(self, data=None, varlist=None, tgt_name=None):
    """
    Build a statsmodels-style summary for the trained LR model.

    Arguments left as ``None`` fall back to the values stored on the
    instance. The features are passed through ``self._apply_standardizer``
    before being handed to ``get_lr_statsmodel_summary``.

    Parameters
    ----------
    data : pd.DataFrame, optional
        Data to compute the summary on (stored training data if None)
    varlist : list of str, optional
        Feature names (stored varlist if None)
    tgt_name : str, optional
        Target column name (stored tgt_name if None)

    Returns
    -------
    pandas.DataFrame
        Result of ``get_lr_statsmodel_summary``; documented as a table of
        coefficients, standard errors, z-scores and p-values.

    Notes
    -----
    With standardization enabled the summary is computed in the
    standardized feature space, matching how the model was trained.
    """
    frame = self._data if data is None else data
    columns = self.varlist if varlist is None else varlist
    target_col = self.tgt_name if tgt_name is None else tgt_name

    fitted = self.model
    design = self._apply_standardizer(frame[columns])
    target = frame[target_col]
    return get_lr_statsmodel_summary(fitted, design, target, feature_names=columns)

get_aic

get_aic(data=None, varlist=None, tgt_name=None, weight_col=None)

Compute AIC for the trained model.

参数:

名称 类型 描述 默认
data DataFrame
None
varlist list of str
None
tgt_name str
None

返回:

类型 描述
float
源代码位于: Modeling_Tool/Model/LRM_Tool.py
def get_aic(self, data=None, varlist=None, tgt_name=None, weight_col=None):
    """
    Return the Akaike Information Criterion (AIC) of the trained model.

    Every argument left as ``None`` falls back to the value stored on the
    instance (``self._data``, ``self.varlist``, ``self.tgt_name``). The
    feature frame is passed through ``self._apply_standardizer`` before it
    is handed to ``compute_aic``.

    Parameters
    ----------
    data : pd.DataFrame, optional
        Data to evaluate on; defaults to the stored data.
    varlist : list of str, optional
        Feature columns; defaults to the stored feature list.
    tgt_name : str, optional
        Target column; defaults to the stored target name.
    weight_col : str, optional
        Forwarded with ``data`` to ``resolve_sample_weight``; the result is
        passed to ``compute_aic`` as ``sample_weight``.

    Returns
    -------
    float
    """
    frame = self._data if data is None else data
    columns = self.varlist if varlist is None else varlist
    target_col = self.tgt_name if tgt_name is None else tgt_name

    fitted = self.model
    features = self._apply_standardizer(frame[columns])
    target = frame[target_col]
    weights = resolve_sample_weight(data=frame, weight_col=weight_col, expected_len=len(frame))
    return compute_aic(fitted, features, target, sample_weight=weights)

get_bic

get_bic(data=None, varlist=None, tgt_name=None, weight_col=None)

Compute BIC for the trained model.

参数:

名称 类型 描述 默认
data DataFrame
None
varlist list of str
None
tgt_name str
None

返回:

类型 描述
float
源代码位于: Modeling_Tool/Model/LRM_Tool.py
def get_bic(self, data=None, varlist=None, tgt_name=None, weight_col=None):
    """
    Return the Bayesian Information Criterion (BIC) of the trained model.

    Every argument left as ``None`` falls back to the value stored on the
    instance (``self._data``, ``self.varlist``, ``self.tgt_name``). The
    feature frame is passed through ``self._apply_standardizer`` before it
    is handed to ``compute_bic``.

    Parameters
    ----------
    data : pd.DataFrame, optional
        Data to evaluate on; defaults to the stored data.
    varlist : list of str, optional
        Feature columns; defaults to the stored feature list.
    tgt_name : str, optional
        Target column; defaults to the stored target name.
    weight_col : str, optional
        Forwarded with ``data`` to ``resolve_sample_weight``; the result is
        passed to ``compute_bic`` as ``sample_weight``.

    Returns
    -------
    float
    """
    frame = self._data if data is None else data
    columns = self.varlist if varlist is None else varlist
    target_col = self.tgt_name if tgt_name is None else tgt_name

    fitted = self.model
    features = self._apply_standardizer(frame[columns])
    target = frame[target_col]
    weights = resolve_sample_weight(data=frame, weight_col=weight_col, expected_len=len(frame))
    return compute_bic(fitted, features, target, sample_weight=weights)

stepwise_selection

stepwise_selection(data, varlist, tgt_name, criterion='aic', direction='both', max_iter=100, verbose=True, weight_col=None)

Perform stepwise variable selection.

Iteratively adds or removes features based on AIC/BIC improvement.

When standardize=True, all interim fits and the final model are trained on standardized features, and the fitted scaler for the selected columns is stored on the instance for later prediction.

参数:

名称 类型 描述 默认
data DataFrame

Training data

必需
varlist list of str

Initial feature list

必需
tgt_name str

Target variable name

必需
criterion str

Selection criterion, 'aic' or 'bic'

'aic'
direction str

Direction of stepwise selection: 'forward', 'backward', or 'both'

'both'
max_iter int

Maximum number of iterations

100
verbose bool

Whether to print progress

True

返回:

类型 描述
list of str

Selected feature list

源代码位于: Modeling_Tool/Model/LRM_Tool.py
def stepwise_selection(
    self,
    data,
    varlist,
    tgt_name,
    criterion='aic',
    direction='both',
    max_iter=100,
    verbose=True,
    weight_col=None,
):
    """
    Perform stepwise variable selection.

    Iteratively adds or removes features based on AIC/BIC improvement.

    When `standardize=True`, all interim fits and the final model are
    trained on standardized features, and the fitted scaler for the selected
    columns is stored on the instance for later prediction.

    Parameters
    ----------
    data : pd.DataFrame
        Training data
    varlist : list of str
        Initial feature list
    tgt_name : str
        Target variable name
    criterion : str, default 'aic'
        Selection criterion. ``'aic'`` uses ``compute_aic``; any other value
        uses ``compute_bic``.
    direction : str, default 'both'
        Direction of stepwise selection: 'forward', 'backward', or 'both'.
        'forward' starts from an empty model; the other directions start
        from the full ``varlist``.
    max_iter : int, default 100
        Maximum number of iterations
    verbose : bool, default True
        Whether to log progress
    weight_col : optional
        Forwarded to ``resolve_sample_weight`` together with ``data``; the
        resolved weights are used for every fit and every criterion value.

    Returns
    -------
    list of str
        Selected feature list

    Notes
    -----
    Side effects: sets ``self.varlist``, ``self.tgt_name``, ``self._data``,
    ``self.standardizer`` and ``self.model`` (refit on the selected columns).
    """
    # Weights depend only on (data, weight_col), so resolve them once and
    # reuse them for every candidate fit and every criterion evaluation.
    sample_weight = resolve_sample_weight(data=data, weight_col=weight_col, expected_len=len(data))
    score_model = compute_aic if criterion == 'aic' else compute_bic

    # When standardizing, operate on a once-standardized feature frame.
    # Column-wise scalers (StandardScaler / MinMaxScaler) make slicing a
    # subset of columns equivalent to standardizing that subset.
    if self.standardize:
        interim_scaler = self._make_scaler()
        interim_scaler.fit(data[varlist])
        work = pd.DataFrame(
            interim_scaler.transform(data[varlist]),
            columns=list(varlist), index=data.index,
        )
    else:
        work = data

    target = data[tgt_name]

    def fit_and_score(cols):
        # Fit on the given columns and return the selection criterion.
        frame = work[cols]
        model = lr_model(frame, target, None, None, self.params, sample_weight=sample_weight)
        return score_model(model, frame, target, sample_weight=sample_weight)

    current_vars = list(varlist) if direction != 'forward' else []
    remaining_vars = list(varlist) if direction == 'forward' else []

    # An empty starting model scores +inf so that any fitted candidate wins.
    best_score = fit_and_score(current_vars) if current_vars else float('inf')

    for iteration in range(max_iter):
        improved = False

        # Forward step: add the single variable that lowers the criterion most.
        if direction in ('forward', 'both') and remaining_vars:
            scores = {}
            for var in remaining_vars:
                try:
                    scores[var] = fit_and_score(current_vars + [var])
                except Exception as exc:
                    # Best-effort: a candidate that cannot be fitted is skipped,
                    # but the reason is kept in the debug log.
                    logger.debug("stepwise: skipped ADD candidate '%s': %s", var, exc)
            if scores:
                best_var = min(scores, key=scores.get)
                if scores[best_var] < best_score:
                    current_vars.append(best_var)
                    remaining_vars.remove(best_var)
                    best_score = scores[best_var]
                    improved = True
                    if verbose:
                        logger.info(f"[Step {iteration+1}] ADD '{best_var}', {criterion.upper()}={best_score:.4f}")

        # Backward step: drop the variable whose removal lowers the criterion
        # most; at least one variable is always kept.
        if direction in ('backward', 'both') and len(current_vars) > 1:
            scores = {}
            for var in current_vars:
                trial_vars = [v for v in current_vars if v != var]
                try:
                    scores[var] = fit_and_score(trial_vars)
                except Exception as exc:
                    logger.debug("stepwise: skipped REMOVE candidate '%s': %s", var, exc)
            if scores:
                worst_var = min(scores, key=scores.get)
                if scores[worst_var] < best_score:
                    current_vars.remove(worst_var)
                    if direction == 'both':
                        # A removed variable may be re-added by a later forward step.
                        remaining_vars.append(worst_var)
                    best_score = scores[worst_var]
                    improved = True
                    if verbose:
                        logger.info(f"[Step {iteration+1}] REMOVE '{worst_var}', {criterion.upper()}={best_score:.4f}")

        if not improved:
            break

    if verbose:
        logger.info(f"Stepwise selection complete. Selected {len(current_vars)} features.")

    self.varlist = current_vars
    self.tgt_name = tgt_name
    self._data = data

    # Refit the scaler on the selected columns only so that prediction-time
    # standardization matches the final feature set.
    if self.standardize:
        self.standardizer = self._make_scaler()
        self.standardizer.fit(data[current_vars])
        final_x = self._apply_standardizer(data[current_vars])
    else:
        self.standardizer = None
        final_x = data[current_vars]

    self.model = lr_model(final_x, data[tgt_name], None, None, self.params, sample_weight=sample_weight)
    return current_vars

grid_search_params

grid_search_params(data, varlist, tgt_name, eval_sets, param_grid, objective='oot_gap_penalized', primary_set=None, gap_ref_sets=None, metric='auc', refit=True, verbose=True, weight_col=None, eval_weight_col=None)

Grid-search LogisticRegression hyperparameters over a holdout-based objective.

For every combination in param_grid (Cartesian product), a candidate model is trained on data and scored by AUC on each dataset in eval_sets. The best combination is chosen by objective (default rewards a high primary-set AUC while penalizing the train/holdout AUC gap, i.e. overfitting). This is a holdout search (not k-fold CV), intended for the typical INS/OOS/OOT credit-scoring setup.

When standardize=True on this instance, every candidate inherits the same standardization config (each candidate fits its own scaler on data), so the search runs in the same feature space the final (optionally refit) model uses.

参数:

名称 类型 描述 默认
data DataFrame

Training dataset (e.g. the in-sample set used for fitting).

必需
varlist list

Feature column names.

必需
tgt_name str

Target column name.

必需
eval_sets dict of {str: pandas.DataFrame}

Ordered mapping of datasets to score by AUC, e.g. {'ins': ins_df, 'oos': oos_df, 'oot': oot_df}.

必需
param_grid dict of {str: iterable}

Hyperparameter search space, e.g. {'C': np.logspace(-3, 2, 31)}. Multiple keys are combined as a Cartesian product.

必需
objective str or callable

How to score each candidate from its per-set AUCs:

  • 'oot_gap_penalized' : AUC[primary] - |mean(AUC[gap_refs]) - AUC[primary]| (maximize the primary set while penalizing the overfitting gap).
  • 'max_primary' : AUC[primary].
  • callable : f(auc_dict) -> float where auc_dict maps set name to AUC.
'oot_gap_penalized'
primary_set str

Key of eval_sets whose AUC to maximize. Defaults to the last key.

None
gap_ref_sets list of str

Set names whose mean AUC forms the gap reference. Defaults to all sets except primary_set. Only used by 'oot_gap_penalized'.

None
metric str

Evaluation metric. Currently only 'auc' is supported.

'auc'
refit bool

If True, refit self on data with the best parameters after searching.

True
verbose bool

Print progress / best result.

True

返回:

类型 描述
DataFrame

Search results sorted by score descending, with columns: the param name(s) + AUC_<name> per eval set + gap (gap objective only) + score.

Side Effects

Sets self.best_params_ (dict) and self.search_results_ (the returned table), and merges the best combo into self.params; if refit=True, also retrains self.model on data.

示例:

>>> tuner = LRMaster(params={'C': 1.0, 'solver': 'lbfgs'})
>>> res = tuner.grid_search_params(
...     data=ins_fit, varlist=woe_cols, tgt_name='bad_flag',
...     eval_sets={'ins': ins_woe, 'oos': oos_woe, 'oot': oot_woe},
...     param_grid={'C': np.logspace(-3, 2, 31)},
...     primary_set='oot', gap_ref_sets=['ins', 'oos'], refit=False,
... )
>>> best_C = tuner.best_params_['C']
源代码位于: Modeling_Tool/Model/LRM_Tool.py
def grid_search_params(self, data, varlist, tgt_name, eval_sets, param_grid,
                       objective='oot_gap_penalized', primary_set=None,
                       gap_ref_sets=None, metric='auc', refit=True, verbose=True,
                       weight_col=None, eval_weight_col=None):
    """
    Grid-search LogisticRegression hyperparameters over a holdout-based objective.

    For every combination in ``param_grid`` (Cartesian product), a candidate model is
    trained on ``data`` and scored by AUC on each dataset in ``eval_sets``. The best
    combination is chosen by ``objective`` (default rewards a high primary-set AUC while
    penalizing the train/holdout AUC gap, i.e. overfitting). This is a **holdout** search
    (not k-fold CV), intended for the typical INS/OOS/OOT credit-scoring setup.

    When ``standardize=True`` on this instance, every candidate inherits the same
    standardization config (each candidate fits its own scaler on ``data``), so the
    search runs in the same feature space the final (optionally refit) model uses.

    Parameters
    ----------
    data : pandas.DataFrame
        Training dataset (e.g. the in-sample set used for fitting).
    varlist : list
        Feature column names.
    tgt_name : str
        Target column name.
    eval_sets : dict of {str: pandas.DataFrame}
        Ordered mapping of datasets to score by AUC, e.g.
        ``{'ins': ins_df, 'oos': oos_df, 'oot': oot_df}``.
    param_grid : dict of {str: iterable}
        Hyperparameter search space, e.g. ``{'C': np.logspace(-3, 2, 31)}``.
        Multiple keys are combined as a Cartesian product.
    objective : str or callable, default 'oot_gap_penalized'
        How to score each candidate from its per-set AUCs:

        - ``'oot_gap_penalized'`` : ``AUC[primary] - |mean(AUC[gap_refs]) - AUC[primary]|``
          (maximize the primary set while penalizing the overfitting gap).
        - ``'max_primary'`` : ``AUC[primary]``.
        - callable : ``f(auc_dict) -> float`` where ``auc_dict`` maps set name to AUC.
    primary_set : str, optional
        Key of ``eval_sets`` whose AUC to maximize. Defaults to the last key.
    gap_ref_sets : list of str, optional
        Set names whose mean AUC forms the gap reference. Defaults to all sets except
        ``primary_set``. Only used by ``'oot_gap_penalized'``.
    metric : str, default 'auc'
        Evaluation metric. Currently only ``'auc'`` is supported.
    refit : bool, default True
        If True, refit ``self`` on ``data`` with the best parameters after searching.
    verbose : bool, default True
        Print progress / best result.
    weight_col : optional
        Forwarded to ``fit`` for every candidate and for the final refit.
    eval_weight_col : optional
        Forwarded with each eval set to ``resolve_sample_weight``; the resolved
        weights are used as ``sample_weight`` when computing that set's AUC.

    Returns
    -------
    pandas.DataFrame
        Search results sorted by ``score`` descending, with columns: the param name(s)
        + ``AUC_<name>`` per eval set + ``gap`` (gap objective only) + ``score``.

    Raises
    ------
    ValueError
        Unsupported ``metric``, empty ``eval_sets``, unknown ``primary_set`` or
        ``objective``, or a ``param_grid`` that yields no combinations.
    KeyError
        ``data`` or an eval set lacks a feature/target column.

    Side Effects
    ------------
    Sets ``self.best_params_`` (dict) and ``self.search_results_`` (the returned table),
    and merges the best combo into ``self.params``; if ``refit=True``, also retrains
    ``self.model`` on ``data``.

    Examples
    --------
    >>> tuner = LRMaster(params={'C': 1.0, 'solver': 'lbfgs'})
    >>> res = tuner.grid_search_params(
    ...     data=ins_fit, varlist=woe_cols, tgt_name='bad_flag',
    ...     eval_sets={'ins': ins_woe, 'oos': oos_woe, 'oot': oot_woe},
    ...     param_grid={'C': np.logspace(-3, 2, 31)},
    ...     primary_set='oot', gap_ref_sets=['ins', 'oos'], refit=False,
    ... )
    >>> best_C = tuner.best_params_['C']
    """
    import itertools
    from sklearn.metrics import roc_auc_score

    if metric != 'auc':
        raise ValueError("Only metric='auc' is currently supported.")
    if not eval_sets:
        raise ValueError("eval_sets must be a non-empty {name: DataFrame} mapping.")
    # Reject an unknown objective before any model is trained.
    if not callable(objective) and objective not in ('oot_gap_penalized', 'max_primary'):
        raise ValueError("Unknown objective: {0}".format(objective))

    set_names = list(eval_sets.keys())
    if primary_set is None:
        primary_set = set_names[-1]
    if primary_set not in eval_sets:
        raise ValueError("primary_set '{0}' not in eval_sets {1}".format(primary_set, set_names))
    if gap_ref_sets is None:
        gap_ref_sets = [n for n in set_names if n != primary_set]

    # Validate columns up-front for a clear error instead of a deep KeyError.
    required_cols = list(varlist) + [tgt_name]
    missing = [c for c in required_cols if c not in data.columns]
    if missing:
        raise KeyError("training data missing columns: {0}".format(missing))
    for _name, _df in eval_sets.items():
        _miss = [c for c in required_cols if c not in _df.columns]
        if _miss:
            raise KeyError("eval set '{0}' missing columns: {1}".format(_name, _miss))

    param_names = list(param_grid.keys())
    combos = list(itertools.product(*[list(param_grid[k]) for k in param_names]))
    if not combos:
        # An empty value list for any key makes the Cartesian product empty.
        raise ValueError("param_grid produced no parameter combinations.")
    use_gap = (not callable(objective)) and objective == 'oot_gap_penalized' and len(gap_ref_sets) > 0

    if verbose:
        print("grid_search_params: {0} 组合 (params={1}), 训练集 {2:,} 行, eval={3}".format(
            len(combos), param_names, len(data), set_names))

    def _score(auc_dict):
        if callable(objective):
            return objective(auc_dict)
        if objective == 'max_primary':
            return auc_dict[primary_set]
        if objective == 'oot_gap_penalized':
            primary = auc_dict[primary_set]
            if gap_ref_sets:
                ref = float(np.mean([auc_dict[n] for n in gap_ref_sets]))
                return primary - abs(ref - primary)
            return primary
        raise ValueError("Unknown objective: {0}".format(objective))

    rows = []
    combo_dicts = []  # original (un-coerced) grid values, aligned with ``rows``
    for combo in combos:
        combo_dict = dict(zip(param_names, combo))
        combo_dicts.append(combo_dict)
        # Candidates inherit this instance's standardization config so the
        # search happens in the same feature space the final model uses.
        cand = LRMaster(
            params={**self.params, **combo_dict},
            standardize=self.standardize,
            scaler=self._scaler_proto,
        )
        cand.fit(data, varlist, tgt_name, weight_col=weight_col)

        auc_dict = {}
        for name, df_eval in eval_sets.items():
            proba = cand.predict_proba(df_eval, varlist)[:, 1]
            eval_sw = resolve_sample_weight(data=df_eval, weight_col=eval_weight_col, expected_len=len(df_eval))
            auc_dict[name] = roc_auc_score(df_eval[tgt_name], proba, sample_weight=eval_sw)

        row = dict(combo_dict)
        for name in set_names:
            row['AUC_{0}'.format(name)] = round(auc_dict[name], 5)
        if use_gap:
            ref = float(np.mean([auc_dict[n] for n in gap_ref_sets]))
            row['gap'] = round(ref - auc_dict[primary_set], 5)
        row['score'] = round(_score(auc_dict), 5)
        rows.append(row)

    ranked = pd.DataFrame(rows).sort_values('score', ascending=False)
    # The pre-reset index label is the position of the winning combo in
    # ``combo_dicts``; remember it before the index is renumbered.
    best_pos = ranked.index[0]
    search_df = ranked.reset_index(drop=True)
    best_row = search_df.iloc[0]

    def _native(v):
        # Cast numpy scalars to native Python types for a clean repr /
        # JSON-serializable params.
        return v.item() if hasattr(v, 'item') else v

    # Take the best params from the original grid values rather than from
    # ``best_row``: a row extracted from an all-numeric frame is upcast to
    # float64, which would turn integer hyperparameters into floats.
    self.best_params_ = {k: _native(v) for k, v in combo_dicts[best_pos].items()}
    self.search_results_ = search_df
    self.params = {**self.params, **self.best_params_}

    # Round float param columns for display only (does not affect best_params_).
    for k in param_names:
        if pd.api.types.is_float_dtype(search_df[k]):
            search_df[k] = search_df[k].round(5)

    if verbose:
        print("★ best: {0} | score={1:.5f} | AUC_{2}={3:.5f}".format(
            self.best_params_, best_row['score'], primary_set,
            best_row['AUC_{0}'.format(primary_set)]))

    if refit:
        self.fit(data, varlist, tgt_name, weight_col=weight_col)

    return search_df

clone

clone()

Create a copy of this LRMaster with the same parameters.

The standardization configuration (standardize flag and scaler prototype) is carried over, but no fitted model or fitted scaler is copied.

返回:

类型 描述
LRMaster

New instance with same params/standardization config but no fitted model

源代码位于: Modeling_Tool/Model/LRM_Tool.py
def clone(self):
    """
    Build a fresh, unfitted LRMaster configured like this one.

    The new instance receives a shallow copy of ``self.params`` plus the
    same ``standardize`` flag and scaler prototype. Nothing else is passed
    to the constructor, so no fitted model or fitted scaler is handed over.

    Returns
    -------
    LRMaster
        New instance with same params/standardization config but no fitted model
    """
    config = {
        'params': dict(self.params),  # copy so the clone cannot mutate ours
        'standardize': self.standardize,
        'scaler': self._scaler_proto,
    }
    return LRMaster(**config)

lr_model

lr_model(mdlx, mdly, valx, valy, params_dict, sample_weight=None)

Train a Logistic Regression model.

参数:

名称 类型 描述 默认
mdlx DataFrame or ndarray

Training feature matrix

必需
mdly Series or ndarray

Training target variable

必需
valx DataFrame or ndarray

Validation feature matrix (used for reference only)

必需
valy Series or ndarray

Validation target variable (used for reference only)

必需
params_dict dict

Dictionary of parameters for LogisticRegression

必需
sample_weight array-like

Per-sample weights passed to LogisticRegression.fit.

None

返回:

类型 描述
LogisticRegression

Trained logistic regression model

源代码位于: Modeling_Tool/Model/LRM_Tool.py
def lr_model(mdlx, mdly, valx, valy, params_dict, sample_weight=None):
    """
    Fit and return a Logistic Regression model.

    Parameters
    ----------
    mdlx : pandas.DataFrame or numpy.ndarray
        Training feature matrix
    mdly : pandas.Series or numpy.ndarray
        Training target variable
    valx : pandas.DataFrame or numpy.ndarray
        Validation feature matrix (accepted but not used by this function)
    valy : pandas.Series or numpy.ndarray
        Validation target variable (accepted but not used by this function)
    params_dict : dict
        Parameters for LogisticRegression; passed through
        ``_sanitize_lr_params`` before the estimator is constructed
    sample_weight : array-like, optional
        Per-sample weights passed to ``LogisticRegression.fit``.

    Returns
    -------
    sklearn.linear_model.LogisticRegression
        Trained logistic regression model
    """
    clean_params = _sanitize_lr_params(params_dict)
    estimator = LogisticRegression(**clean_params)
    estimator.fit(mdlx, mdly, sample_weight=sample_weight)
    return estimator

lr_varimp

lr_varimp(model)

Get variable importance from a Logistic Regression model.

Computes the absolute value of the model coefficients as a measure of variable importance.

参数:

名称 类型 描述 默认
model LogisticRegression

Trained logistic regression model

必需

返回:

类型 描述
DataFrame

DataFrame with columns ['varlist', 'coef', 'importance'] sorted by importance in descending order

源代码位于: Modeling_Tool/Model/LRM_Tool.py
def lr_varimp(model):
    """
    Rank the variables of a Logistic Regression model by |coefficient|.

    Only the first row of ``model.coef_`` is used. Names come from
    ``model.feature_names_in_`` when the model has that attribute; otherwise
    positional names ``x0, x1, ...`` are generated.

    Parameters
    ----------
    model : sklearn.linear_model.LogisticRegression
        Trained logistic regression model

    Returns
    -------
    pandas.DataFrame
        DataFrame with columns ['varlist', 'coef', 'importance'] sorted by
        importance in descending order
    """
    coefs = model.coef_[0]

    if not hasattr(model, 'feature_names_in_'):
        names = ['x{0}'.format(idx) for idx, _ in enumerate(coefs)]
    else:
        names = model.feature_names_in_.tolist()

    table = pd.DataFrame({'varlist': names, 'coef': coefs, 'importance': np.abs(coefs)})
    ranked = table.sort_values('importance', ascending=False)
    return ranked.reset_index(drop=True)

get_lr_statsmodel_summary

get_lr_statsmodel_summary(model, x, y, feature_names=None)

Generate a statsmodels-style summary for a sklearn LogisticRegression model.

Computes standard errors, z-scores, p-values, and confidence intervals for the logistic regression coefficients using the observed Fisher information matrix.

参数:

名称 类型 描述 默认
model LogisticRegression

Trained logistic regression model

必需
x DataFrame or ndarray

Feature matrix used for training

必需
y Series or ndarray

Target variable used for training

必需
feature_names list of str

Feature names (inferred from x if not provided)

None

返回:

类型 描述
DataFrame

Summary table with columns: ['coef', 'std_err', 'z', 'p_value', 'ci_lower', 'ci_upper']

源代码位于: Modeling_Tool/Model/LRM_Tool.py
def get_lr_statsmodel_summary(model, x, y, feature_names=None):
    """
    Generate a statsmodels-style summary for a sklearn LogisticRegression model.

    Computes standard errors, z-scores, p-values, and confidence intervals
    for the logistic regression coefficients from the information matrix
    ``X' W X``, where ``X`` is ``x`` with a leading column of ones and
    ``W = diag(p * (1 - p))`` uses the model's predicted probabilities.

    Parameters
    ----------
    model : sklearn.linear_model.LogisticRegression
        Trained logistic regression model
    x : pandas.DataFrame or numpy.ndarray
        Feature matrix used for training
    y : pandas.Series or numpy.ndarray
        Target variable used for training. Not used in the computation; kept
        for interface compatibility.
    feature_names : sequence of str, optional
        Feature names. When omitted they are taken from ``x.columns``, then
        from ``model.feature_names_in_``, then generated as ``x0, x1, ...``.

    Returns
    -------
    pandas.DataFrame
        Summary table with columns: ['coef', 'std_err', 'z', 'p_value',
        'ci_lower', 'ci_upper'], indexed by 'Intercept' followed by the
        feature names. The interval is ``coef ± 1.96 * std_err``.
    """
    from scipy import stats

    x_arr = x.values if hasattr(x, 'values') else np.array(x)

    if feature_names is None:
        if hasattr(x, 'columns'):
            feature_names = x.columns.tolist()
        elif hasattr(model, 'feature_names_in_'):
            feature_names = model.feature_names_in_.tolist()
        else:
            # Use the converted array so plain nested lists also work.
            feature_names = [f'x{i}' for i in range(x_arr.shape[1])]

    prob = model.predict_proba(x_arr)[:, 1]
    w = prob * (1 - prob)
    X_design = np.hstack([np.ones((x_arr.shape[0], 1)), x_arr])

    # X' W X without materialising the n-by-n diagonal matrix W: scaling the
    # rows of X by w is equivalent and keeps memory linear in the row count.
    information = X_design.T @ (X_design * w[:, None])

    try:
        cov_matrix = np.linalg.inv(information)
    except np.linalg.LinAlgError:
        # Singular information matrix: fall back to the pseudo-inverse.
        cov_matrix = np.linalg.pinv(information)

    intercept = model.intercept_[0]
    coefs = model.coef_[0]
    all_coefs = np.concatenate([[intercept], coefs])

    std_errs = np.sqrt(np.diag(cov_matrix))
    z_scores = all_coefs / std_errs
    # The survival function avoids the cancellation in ``1 - cdf`` that
    # rounds very small p-values to exactly zero.
    p_values = 2 * stats.norm.sf(np.abs(z_scores))
    ci_lower = all_coefs - 1.96 * std_errs
    ci_upper = all_coefs + 1.96 * std_errs

    # list() lets callers pass a tuple, ndarray or pandas Index of names.
    all_names = ['Intercept'] + list(feature_names)

    summary_df = pd.DataFrame({
        'coef': all_coefs,
        'std_err': std_errs,
        'z': z_scores,
        'p_value': p_values,
        'ci_lower': ci_lower,
        'ci_upper': ci_upper
    }, index=all_names)

    return summary_df

compute_aic

compute_aic(model, x, y, sample_weight=None)

Compute AIC (Akaike Information Criterion) for a logistic regression model.

参数:

名称 类型 描述 默认
model LogisticRegression

Fitted logistic regression model

必需
x DataFrame or ndarray

Feature matrix

必需
y Series or ndarray

Target variable

必需
sample_weight array-like

Per-sample weights for log-likelihood / information criteria.

None

返回:

类型 描述
float

AIC value (lower is better)

源代码位于: Modeling_Tool/Model/LRM_Tool.py
def compute_aic(model, x, y, sample_weight=None):
    """
    Return the Akaike Information Criterion of a logistic regression model.

    AIC is ``2 * k - 2 * log_likelihood`` where ``k`` is the number of
    coefficient columns plus one for the intercept.

    Parameters
    ----------
    model : sklearn.linear_model.LogisticRegression
        Fitted logistic regression model
    x : pandas.DataFrame or numpy.ndarray
        Feature matrix
    y : pandas.Series or numpy.ndarray
        Target variable
    sample_weight : array-like, optional
        Per-sample weights forwarded to the log-likelihood computation.

    Returns
    -------
    float
        AIC value (lower is better)
    """
    ll = _compute_log_likelihood(model, x, y, sample_weight=sample_weight)
    n_params = model.coef_.shape[1] + 1  # coefficients + intercept
    return 2 * n_params - 2 * ll

compute_bic

compute_bic(model, x, y, sample_weight=None)

Compute BIC (Bayesian Information Criterion) for a logistic regression model.

参数:

名称 类型 描述 默认
model LogisticRegression

Fitted logistic regression model

必需
x DataFrame or ndarray

Feature matrix

必需
y Series or ndarray

Target variable

必需
sample_weight array-like

Per-sample weights for log-likelihood / information criteria.

None

返回:

类型 描述
float

BIC value (lower is better)

源代码位于: Modeling_Tool/Model/LRM_Tool.py
def compute_bic(model, x, y, sample_weight=None):
    """
    Return the Bayesian Information Criterion of a logistic regression model.

    BIC is ``k * log(n) - 2 * log_likelihood`` where ``k`` is the number of
    coefficient columns plus one for the intercept. ``n`` is the sum of the
    sample weights when weights are given, otherwise the number of rows of
    ``x``.

    Parameters
    ----------
    model : sklearn.linear_model.LogisticRegression
        Fitted logistic regression model
    x : pandas.DataFrame or numpy.ndarray
        Feature matrix
    y : pandas.Series or numpy.ndarray
        Target variable
    sample_weight : array-like, optional
        Per-sample weights; converted to a float array before use.

    Returns
    -------
    float
        BIC value (lower is better)
    """
    features = x.values if hasattr(x, 'values') else np.array(x)
    if sample_weight is None:
        weights = None
    else:
        weights = np.asarray(sample_weight, dtype=float)

    ll = _compute_log_likelihood(model, x, y, sample_weight=weights)
    n_params = model.coef_.shape[1] + 1

    # Effective sample size: total weight if weighted, row count otherwise.
    if weights is None:
        n_obs = features.shape[0]
    else:
        n_obs = float(np.sum(weights))

    return n_params * np.log(n_obs) - 2 * ll

梯度提升模型 — GBM_Tool

GBM_Tool

梯度提升模型训练工具包

本模块提供LightGBM、XGBoost和CatBoost模型的快速训练和评估功能, 包括模型训练、特征重要性提取等常用操作。

函数:

名称 描述
set_num_leaves

根据最大深度计算叶子节点数,避免过拟合。

lgb_model

快速训练LightGBM模型。

lgb_varimp

获取LightGBM特征重要性。

lgbm_quick_train

快速训练LightGBM模型(使用DataFrame接口)。

xgb_model

训练XGBoost模型。

xgb_varimp

获取XGBoost特征重要性。

xgbm_quick_train

快速训练XGBoost模型(使用DataFrame接口)。

catboost_model

训练CatBoost模型。

catboost_varimp

获取CatBoost特征重要性。

catboost_quick_train

快速训练CatBoost模型(使用DataFrame接口)。

类:

名称 描述
LightGBMModel

LightGBM模型封装类,提供统一的训练和评估接口。

XGBoostModel

XGBoost模型封装类,提供统一的训练和评估接口。

CatBoostModel

CatBoost模型封装类,提供统一的训练和评估接口。

GradientBoostingModel

统一封装类,支持LightGBM、XGBoost和CatBoost切换。

示例:

函数式调用

>>> model = lgb_model(x_train, y_train, x_val, y_val, params)
>>> varimp = lgb_varimp(model)

类封装调用

>>> lgb_model = LightGBMModel(params)
>>> lgb_model.fit(x_train, y_train, x_val, y_val)
>>> varimp = lgb_model.get_feature_importance()

统一接口调用

>>> model = GradientBoostingModel('lgb', params)
>>> model.fit(x_train, y_train, x_val, y_val)

LightGBMModel

LightGBM模型封装类。

提供统一的LightGBM模型训练、预测、保存和加载接口。 支持模型校准、特征重要性获取等功能。

参数:

名称 类型 描述 默认
params dict

LightGBM模型参数字典

必需
model LGBMClassifier

预加载的模型实例

None

示例:

>>> lgb_clf = LightGBMModel(params)
>>> lgb_clf.fit(x_train, y_train, x_val, y_val)
>>> preds = lgb_clf.predict(x_test)
源代码位于: Modeling_Tool/Model/GBM_Tool.py
class LightGBMModel:
    """
    Wrapper class around a LightGBM model.

    Bundles training, prediction, persistence, probability calibration and a
    few evaluation helpers behind one object.

    Parameters
    ----------
    params : dict
        LightGBM parameter dictionary
    model : lgb.LGBMClassifier, optional
        Pre-loaded model instance

    Examples
    --------
    >>> lgb_clf = LightGBMModel(params)
    >>> lgb_clf.fit(x_train, y_train, x_val, y_val)
    >>> preds = lgb_clf.predict(x_test)
    """

    def __init__(self, params, model=None):
        """
        Initialise the wrapper.

        Parameters
        ----------
        params : dict
            LightGBM parameter dictionary
        model : lgb.LGBMClassifier, optional
            Pre-loaded model instance
        """
        # Return value is not needed here; the call is kept for whatever
        # side effect _get_lgb has (presumably an import check — TODO confirm).
        _get_lgb()
        self.params = params
        self.model = model
        self.feature_names_ = None

    def fit(self, x, y, valx, valy, wgt=None, init_score=None, sample_weight=None, eval_sample_weight=None):
        """Train the LightGBM model.

        Delegates to ``lgb_model`` with the training and validation sets and
        stores the returned model on ``self.model``.

        Parameters
        ----------
        x : array-like or pd.DataFrame
            Training features
        y : array-like
            Training labels
        valx : array-like or pd.DataFrame
            Validation features
        valy : array-like
            Validation labels
        wgt : array-like, optional
            Sample weights
        init_score : array-like, optional
            Initial scores
        sample_weight : array-like, optional
            Used as the sample weights when ``wgt`` is None
        eval_sample_weight : optional
            Forwarded unchanged to ``lgb_model``

        Returns
        -------
        self
        """
        weights = sample_weight if wgt is None else wgt
        self.model = lgb_model(
            x=x,
            y=y,
            valx=valx,
            valy=valy,
            params_dict=self.params,
            wgt=weights,
            init_score=init_score,
            eval_sample_weight=eval_sample_weight,
        )
        # Remember column names only when the input carries them.
        if hasattr(x, 'columns'):
            self.feature_names_ = [*x.columns]
        return self

    def predict(self, x):
        """Predict probabilities.

        Parameters
        ----------
        x : array-like or pd.DataFrame
            Features to score

        Returns
        -------
        np.ndarray
            Column 1 of ``self.model.predict_proba(x)``
        """
        proba = self.model.predict_proba(x)
        return proba[:, 1]

    def get_feature_importance(self, importance_type='gain'):
        """Get feature importance.

        Parameters
        ----------
        importance_type : str, default 'gain'
            Importance type, 'gain' or 'split'.
            NOTE(review): this argument is not forwarded to ``lgb_varimp``
            in the current implementation.

        Returns
        -------
        pd.DataFrame
            Result of ``lgb_varimp(self.model)``
        """
        importance = lgb_varimp(self.model)
        return importance

    def save(self, path):
        """Save the model.

        Parameters
        ----------
        path : str
            Destination path
        """
        fitted = self.model
        save_model(fitted, path)

    def load(self, path):
        """Load a model and store it on ``self.model``.

        Parameters
        ----------
        path : str
            Model file path

        Returns
        -------
        self
        """
        restored = load_model(path)
        self.model = restored
        return self

    def calibrate(self, x, y, method='sigmoid', cv='prefit'):
        """Calibrate the model's probabilities.

        Wraps the current model in ``CalibratedClassifierCV`` and fits the
        wrapper; ``self.model`` is replaced by the wrapper.

        Parameters
        ----------
        x : array-like
            Calibration features
        y : array-like
            Calibration labels
        method : str, default 'sigmoid'
            Calibration method, 'sigmoid' or 'isotonic'
        cv : str or int, default 'prefit'
            Cross-validation setting

        Returns
        -------
        self
        """
        wrapper = CalibratedClassifierCV(self.model, method=method, cv=cv)
        # Assigned before fitting, as the wrapper replaces the raw model.
        self.model = wrapper
        wrapper.fit(x, y)
        return self

    def calibration_curve(self, x, y, n_bins=10):
        """Get calibration-curve data.

        Parameters
        ----------
        x : array-like
            Features
        y : array-like
            Labels
        n_bins : int, default 10
            Number of bins

        Returns
        -------
        tuple
            (fraction_of_positives, mean_predicted_value)
        """
        return calibration_curve(y, self.predict(x), n_bins=n_bins)

    def brier_score(self, x, y):
        """Compute the Brier score.

        Parameters
        ----------
        x : array-like
            Features
        y : array-like
            Labels

        Returns
        -------
        float
            Brier score
        """
        return brier_score_loss(y, self.predict(x))

    def roc_auc(self, x, y):
        """Compute ROC AUC.

        Parameters
        ----------
        x : array-like
            Features
        y : array-like
            Labels

        Returns
        -------
        float
            ROC AUC score
        """
        return roc_auc_score(y, self.predict(x))

fit

fit(x, y, valx, valy, wgt=None, init_score=None, sample_weight=None, eval_sample_weight=None)

训练LightGBM模型。

使用训练集和验证集训练模型,支持早停机制。

参数:

名称 类型 描述 默认
x array-like or DataFrame

训练集特征

必需
y array-like

训练集标签

必需
valx array-like or DataFrame

验证集特征

必需
valy array-like

验证集标签

必需
wgt array-like

样本权重

None
init_score array-like

初始化分数

None

返回:

类型 描述
self
源代码位于: Modeling_Tool/Model/GBM_Tool.py
def fit(self, x, y, valx, valy, wgt=None, init_score=None, sample_weight=None, eval_sample_weight=None):
    """Train the LightGBM model.

    Trains on the training set with a validation set via ``lgb_model``
    (the original note states that early stopping is supported).

    Parameters
    ----------
    x : array-like or pd.DataFrame
        Training features.
    y : array-like
        Training labels.
    valx : array-like or pd.DataFrame
        Validation features.
    valy : array-like
        Validation labels.
    wgt : array-like, optional
        Sample weights.
    init_score : array-like, optional
        Initial scores.
    sample_weight : array-like, optional
        Alias for ``wgt``; only used when ``wgt`` is None.
    eval_sample_weight : optional
        Forwarded unchanged to ``lgb_model``.

    Returns
    -------
    self
    """
    # wgt wins when both are supplied; sample_weight is only a fallback.
    train_weight = sample_weight if wgt is None else wgt
    fitted = lgb_model(
        x=x,
        y=y,
        valx=valx,
        valy=valy,
        params_dict=self.params,
        wgt=train_weight,
        init_score=init_score,
        eval_sample_weight=eval_sample_weight,
    )
    self.model = fitted
    # Column names are recorded only for inputs exposing ``.columns``.
    if hasattr(x, 'columns'):
        self.feature_names_ = list(x.columns)
    return self

predict

predict(x)

预测样本的概率。

参数:

名称 类型 描述 默认
x array - like or DataFrame

预测特征

必需

返回:

类型 描述
ndarray

预测概率

源代码位于: Modeling_Tool/Model/GBM_Tool.py
def predict(self, x):
    """Predict probabilities for the given samples.

    Parameters
    ----------
    x : array-like or pd.DataFrame
        Features to score.

    Returns
    -------
    np.ndarray
        Column index 1 of ``self.model.predict_proba(x)``.
    """
    proba = self.model.predict_proba(x)
    return proba[:, 1]

get_feature_importance

get_feature_importance(importance_type='gain')

获取特征重要性。

参数:

名称 类型 描述 默认
importance_type str

特征重要性类型,可选 'gain' 或 'split'

'gain'

返回:

类型 描述
DataFrame

包含 feature 和 importance 列的DataFrame

源代码位于: Modeling_Tool/Model/GBM_Tool.py
def get_feature_importance(self, importance_type='gain'):
    """Return the feature importances of the wrapped model.

    Parameters
    ----------
    importance_type : str, default 'gain'
        Importance type, 'gain' or 'split'.  NOTE(review): this value is
        not forwarded to ``lgb_varimp``, so it currently has no effect here.

    Returns
    -------
    pd.DataFrame
        DataFrame with ``feature`` and ``importance`` columns.
    """
    fitted = self.model
    return lgb_varimp(fitted)

save

save(path)

保存模型。

参数:

名称 类型 描述 默认
path str

模型保存路径

必需
源代码位于: Modeling_Tool/Model/GBM_Tool.py
def save(self, path):
    """Save the wrapped model.

    Parameters
    ----------
    path : str
        Destination path, handed to ``save_model``.
    """
    fitted = self.model
    save_model(fitted, path)

load

load(path)

加载模型。

参数:

名称 类型 描述 默认
path str

模型文件路径

必需

返回:

类型 描述
self
源代码位于: Modeling_Tool/Model/GBM_Tool.py
def load(self, path):
    """Load a model and store it on this wrapper.

    Parameters
    ----------
    path : str
        Path of the model file, handed to ``load_model``.

    Returns
    -------
    self
    """
    loaded = load_model(path)
    self.model = loaded
    return self

calibrate

calibrate(x, y, method='sigmoid', cv='prefit')

模型概率校准。

参数:

名称 类型 描述 默认
x array - like

校准特征

必需
y array - like

校准标签

必需
method str

校准方法,'sigmoid' 或 'isotonic'

'sigmoid'
cv str or int

交叉验证方式

'prefit'

返回:

类型 描述
self
源代码位于: Modeling_Tool/Model/GBM_Tool.py
def calibrate(self, x, y, method='sigmoid', cv='prefit'):
    """Calibrate the wrapped model's predicted probabilities.

    The current ``self.model`` is wrapped in ``CalibratedClassifierCV``
    and the wrapper is fitted on ``(x, y)``.  ``self.model`` is replaced
    only after the fit succeeds, so a failing calibration leaves the
    previously stored model untouched.

    Parameters
    ----------
    x : array-like
        Calibration features.
    y : array-like
        Calibration labels.
    method : str, default 'sigmoid'
        Calibration method, 'sigmoid' or 'isotonic'.
    cv : str or int, default 'prefit'
        Cross-validation setting, forwarded as ``cv``.

    Returns
    -------
    self
    """
    # Fit on a local first: assigning before fitting would leave an
    # unfitted calibrator in self.model if fit() raised.
    calibrator = CalibratedClassifierCV(self.model, method=method, cv=cv)
    calibrator.fit(x, y)
    self.model = calibrator
    return self

calibration_curve

calibration_curve(x, y, n_bins=10)

获取校准曲线数据。

参数:

名称 类型 描述 默认
x array - like

特征

必需
y array - like

标签

必需
n_bins int

分箱数

10

返回:

类型 描述
tuple

(fraction_of_positives, mean_predicted_value)

源代码位于: Modeling_Tool/Model/GBM_Tool.py
def calibration_curve(self, x, y, n_bins=10):
    """Compute calibration-curve data for the wrapped model.

    Scores ``x`` with ``self.predict`` and passes the labels and scores
    on to ``calibration_curve``.

    Parameters
    ----------
    x : array-like
        Features.
    y : array-like
        Labels.
    n_bins : int, default 10
        Number of bins.

    Returns
    -------
    tuple
        (fraction_of_positives, mean_predicted_value)
    """
    return calibration_curve(y, self.predict(x), n_bins=n_bins)

brier_score

brier_score(x, y)

计算Brier分数。

参数:

名称 类型 描述 默认
x array - like

特征

必需
y array - like

标签

必需

返回:

类型 描述
float

Brier分数

源代码位于: Modeling_Tool/Model/GBM_Tool.py
def brier_score(self, x, y):
    """Compute the Brier score of the model's predictions on ``x``.

    Parameters
    ----------
    x : array-like
        Features.
    y : array-like
        Labels.

    Returns
    -------
    float
        Brier score as returned by ``brier_score_loss``.
    """
    return brier_score_loss(y, self.predict(x))

roc_auc

roc_auc(x, y)

计算ROC AUC。

参数:

名称 类型 描述 默认
x array - like

特征

必需
y array - like

标签

必需

返回:

类型 描述
float

ROC AUC分数

源代码位于: Modeling_Tool/Model/GBM_Tool.py
def roc_auc(self, x, y):
    """Compute the ROC AUC of the model's predictions on ``x``.

    Parameters
    ----------
    x : array-like
        Features.
    y : array-like
        Labels.

    Returns
    -------
    float
        ROC AUC as returned by ``roc_auc_score``.
    """
    return roc_auc_score(y, self.predict(x))

XGBoostModel

XGBoost模型封装类。

提供统一的XGBoost模型训练、预测、保存和加载接口。 支持模型校准、特征重要性获取等功能。

参数:

名称 类型 描述 默认
params dict

XGBoost模型参数字典

必需
model XGBClassifier

预加载的模型实例

None

示例:

>>> xgb_clf = XGBoostModel(params)
>>> xgb_clf.fit(x_train, y_train, x_val, y_val)
>>> preds = xgb_clf.predict(x_test)
源代码位于: Modeling_Tool/Model/GBM_Tool.py
class XGBoostModel:
    """
    Wrapper class around an XGBoost model.

    Offers a uniform train / predict / save / load interface, plus
    probability calibration, feature-importance lookup and a few
    evaluation helpers.

    Parameters
    ----------
    params : dict
        XGBoost parameter dictionary.
    model : xgb.XGBClassifier, optional
        Pre-loaded model instance.

    Examples
    --------
    >>> xgb_clf = XGBoostModel(params)
    >>> xgb_clf.fit(x_train, y_train, x_val, y_val)
    >>> preds = xgb_clf.predict(x_test)
    """

    def __init__(self, params, model=None):
        """
        Initialise the XGBoost wrapper.

        Parameters
        ----------
        params : dict
            XGBoost parameter dictionary.
        model : xgb.XGBClassifier, optional
            Pre-loaded model instance.
        """
        # The return value is not needed here; the call is kept for its
        # side effect (presumably an xgboost availability check -- confirm).
        _get_xgb()
        self.params = params
        self.model = model
        self.feature_names_ = None

    def fit(self, x, y, valx, valy, sample_weight=None, sample_weight_eval_set=None, base_margin=None):
        """Train the XGBoost model.

        Parameters
        ----------
        x : array-like or pd.DataFrame
            Training features.
        y : array-like
            Training labels.
        valx : array-like or pd.DataFrame
            Validation features.
        valy : array-like
            Validation labels.
        sample_weight : array-like, optional
            Sample weights.
        sample_weight_eval_set : list, optional
            List of sample weights for the validation set.
        base_margin : array-like, optional
            Base margin (init_score / log-odds offset), used for incremental
            training (warm-start).

        Returns
        -------
        self
        """
        self.model = xgb_model(
            x=x, y=y, valx=valx, valy=valy,
            params_dict=self.params,
            sample_weight=sample_weight,
            sample_weight_eval_set=sample_weight_eval_set,
            base_margin=base_margin
        )
        # Column names are recorded only for inputs exposing ``.columns``.
        if hasattr(x, 'columns'):
            self.feature_names_ = list(x.columns)
        return self

    def predict(self, x):
        """Predict probabilities for the given samples.

        Parameters
        ----------
        x : array-like or pd.DataFrame
            Features to score.

        Returns
        -------
        np.ndarray
            Column index 1 of ``self.model.predict_proba(x)``.
        """
        return self.model.predict_proba(x)[:, 1]

    def get_feature_importance(self, importance_type='gain'):
        """Return the feature importances of the wrapped model.

        Parameters
        ----------
        importance_type : str, default 'gain'
            Importance type.  NOTE(review): this value is not forwarded to
            ``xgb_varimp``, so it currently has no effect here.

        Returns
        -------
        pd.DataFrame
            DataFrame with ``feature`` and ``importance`` columns.
        """
        return xgb_varimp(self.model)

    def save(self, path):
        """Save the wrapped model.

        Parameters
        ----------
        path : str
            Destination path, handed to ``save_model``.
        """
        save_model(self.model, path)

    def load(self, path):
        """Load a model and store it on this wrapper.

        Parameters
        ----------
        path : str
            Path of the model file, handed to ``load_model``.

        Returns
        -------
        self
        """
        self.model = load_model(path)
        return self

    def calibrate(self, x, y, method='sigmoid', cv='prefit'):
        """Calibrate the wrapped model's predicted probabilities.

        ``self.model`` is replaced by the fitted ``CalibratedClassifierCV``
        only after the fit succeeds, so a failing calibration leaves the
        previously stored model untouched.

        Parameters
        ----------
        x : array-like
            Calibration features.
        y : array-like
            Calibration labels.
        method : str, default 'sigmoid'
            Calibration method, forwarded as ``method``.
        cv : str or int, default 'prefit'
            Cross-validation setting, forwarded as ``cv``.

        Returns
        -------
        self
        """
        # Fit on a local first: assigning before fitting would leave an
        # unfitted calibrator in self.model if fit() raised.
        calibrator = CalibratedClassifierCV(self.model, method=method, cv=cv)
        calibrator.fit(x, y)
        self.model = calibrator
        return self

    def calibration_curve(self, x, y, n_bins=10):
        """Compute calibration-curve data for the wrapped model.

        Parameters
        ----------
        x : array-like
            Features.
        y : array-like
            Labels.
        n_bins : int, default 10
            Number of bins, forwarded to the module-level
            ``calibration_curve`` function.

        Returns
        -------
        tuple
            Result of the module-level ``calibration_curve`` call.
        """
        y_prob = self.predict(x)
        return calibration_curve(y, y_prob, n_bins=n_bins)

    def brier_score(self, x, y):
        """Compute the Brier score of the model's predictions on ``x``.

        Parameters
        ----------
        x : array-like
            Features.
        y : array-like
            Labels.

        Returns
        -------
        float
            Result of ``brier_score_loss``.
        """
        y_prob = self.predict(x)
        return brier_score_loss(y, y_prob)

    def roc_auc(self, x, y):
        """Compute the ROC AUC of the model's predictions on ``x``.

        Parameters
        ----------
        x : array-like
            Features.
        y : array-like
            Labels.

        Returns
        -------
        float
            Result of ``roc_auc_score``.
        """
        y_prob = self.predict(x)
        return roc_auc_score(y, y_prob)

fit

fit(x, y, valx, valy, sample_weight=None, sample_weight_eval_set=None, base_margin=None)

训练XGBoost模型。

参数:

名称 类型 描述 默认
x array-like or DataFrame

训练集特征

必需
y array-like

训练集标签

必需
valx array-like or DataFrame

验证集特征

必需
valy array-like

验证集标签

必需
sample_weight array-like

样本权重

None
sample_weight_eval_set list

验证集样本权重列表

None
base_margin array-like

基础边际(init_score / log-odds 偏移),用于增量训练(warm-start)。

None

返回:

类型 描述
self
源代码位于: Modeling_Tool/Model/GBM_Tool.py
def fit(self, x, y, valx, valy, sample_weight=None, sample_weight_eval_set=None, base_margin=None):
    """Train the XGBoost model.

    Parameters
    ----------
    x : array-like or pd.DataFrame
        Training features.
    y : array-like
        Training labels.
    valx : array-like or pd.DataFrame
        Validation features.
    valy : array-like
        Validation labels.
    sample_weight : array-like, optional
        Sample weights.
    sample_weight_eval_set : list, optional
        List of sample weights for the validation set.
    base_margin : array-like, optional
        Base margin (init_score / log-odds offset), used for incremental
        training (warm-start).

    Returns
    -------
    self
    """
    fitted = xgb_model(
        x=x,
        y=y,
        valx=valx,
        valy=valy,
        params_dict=self.params,
        sample_weight=sample_weight,
        sample_weight_eval_set=sample_weight_eval_set,
        base_margin=base_margin,
    )
    self.model = fitted
    # Column names are recorded only for inputs exposing ``.columns``.
    if hasattr(x, 'columns'):
        self.feature_names_ = list(x.columns)
    return self

predict

predict(x)

预测样本的概率。

参数:

名称 类型 描述 默认
x array - like or DataFrame

预测特征

必需

返回:

类型 描述
ndarray

预测概率

源代码位于: Modeling_Tool/Model/GBM_Tool.py
def predict(self, x):
    """Predict probabilities for the given samples.

    Parameters
    ----------
    x : array-like or pd.DataFrame
        Features to score.

    Returns
    -------
    np.ndarray
        Column index 1 of ``self.model.predict_proba(x)``.
    """
    proba = self.model.predict_proba(x)
    return proba[:, 1]

get_feature_importance

get_feature_importance(importance_type='gain')

获取特征重要性。

参数:

名称 类型 描述 默认
importance_type str

特征重要性类型

'gain'

返回:

类型 描述
DataFrame

包含 feature 和 importance 列的DataFrame

源代码位于: Modeling_Tool/Model/GBM_Tool.py
def get_feature_importance(self, importance_type='gain'):
    """Return the feature importances of the wrapped model.

    Parameters
    ----------
    importance_type : str, default 'gain'
        Importance type.  NOTE(review): this value is not forwarded to
        ``xgb_varimp``, so it currently has no effect here.

    Returns
    -------
    pd.DataFrame
        DataFrame with ``feature`` and ``importance`` columns.
    """
    fitted = self.model
    return xgb_varimp(fitted)

save

save(path)

保存模型。

参数:

名称 类型 描述 默认
path str

模型保存路径

必需
源代码位于: Modeling_Tool/Model/GBM_Tool.py
def save(self, path):
    """Save the wrapped model.

    Parameters
    ----------
    path : str
        Destination path, handed to ``save_model``.
    """
    fitted = self.model
    save_model(fitted, path)

load

load(path)

加载模型。

参数:

名称 类型 描述 默认
path str

模型文件路径

必需

返回:

类型 描述
self
源代码位于: Modeling_Tool/Model/GBM_Tool.py
def load(self, path):
    """Load a model and store it on this wrapper.

    Parameters
    ----------
    path : str
        Path of the model file, handed to ``load_model``.

    Returns
    -------
    self
    """
    loaded = load_model(path)
    self.model = loaded
    return self

calibrate

calibrate(x, y, method='sigmoid', cv='prefit')

模型概率校准。

参数:

名称 类型 描述 默认
x array - like

校准特征

必需
y array - like

校准标签

必需
method str

校准方法

'sigmoid'
cv str or int

交叉验证方式

'prefit'

返回:

类型 描述
self
源代码位于: Modeling_Tool/Model/GBM_Tool.py
def calibrate(self, x, y, method='sigmoid', cv='prefit'):
    """Calibrate the wrapped model's predicted probabilities.

    The current ``self.model`` is wrapped in ``CalibratedClassifierCV``
    and the wrapper is fitted on ``(x, y)``.  ``self.model`` is replaced
    only after the fit succeeds, so a failing calibration leaves the
    previously stored model untouched.

    Parameters
    ----------
    x : array-like
        Calibration features.
    y : array-like
        Calibration labels.
    method : str, default 'sigmoid'
        Calibration method, forwarded as ``method``.
    cv : str or int, default 'prefit'
        Cross-validation setting, forwarded as ``cv``.

    Returns
    -------
    self
    """
    # Fit on a local first: assigning before fitting would leave an
    # unfitted calibrator in self.model if fit() raised.
    calibrator = CalibratedClassifierCV(self.model, method=method, cv=cv)
    calibrator.fit(x, y)
    self.model = calibrator
    return self

calibration_curve

calibration_curve(x, y, n_bins=10)

获取校准曲线数据。

参数:

名称 类型 描述 默认
x array - like

特征

必需
y array - like

标签

必需

返回:

类型 描述
tuple
源代码位于: Modeling_Tool/Model/GBM_Tool.py
def calibration_curve(self, x, y, n_bins=10):
    """Compute calibration-curve data for the wrapped model.

    Scores ``x`` with ``self.predict`` and passes the labels and scores
    on to ``calibration_curve``.

    Parameters
    ----------
    x : array-like
        Features.
    y : array-like
        Labels.
    n_bins : int, default 10
        Number of bins, forwarded as ``n_bins``.

    Returns
    -------
    tuple
    """
    return calibration_curve(y, self.predict(x), n_bins=n_bins)

brier_score

brier_score(x, y)

计算Brier分数。

参数:

名称 类型 描述 默认
x array - like
必需
y array - like
必需

返回:

类型 描述
float
源代码位于: Modeling_Tool/Model/GBM_Tool.py
def brier_score(self, x, y):
    """Compute the Brier score of the model's predictions on ``x``.

    Parameters
    ----------
    x : array-like
        Features.
    y : array-like
        Labels.

    Returns
    -------
    float
        Result of ``brier_score_loss``.
    """
    return brier_score_loss(y, self.predict(x))

roc_auc

roc_auc(x, y)

计算ROC AUC。

参数:

名称 类型 描述 默认
x array - like
必需
y array - like
必需

返回:

类型 描述
float
源代码位于: Modeling_Tool/Model/GBM_Tool.py
def roc_auc(self, x, y):
    """Compute the ROC AUC of the model's predictions on ``x``.

    Parameters
    ----------
    x : array-like
        Features.
    y : array-like
        Labels.

    Returns
    -------
    float
        Result of ``roc_auc_score``.
    """
    return roc_auc_score(y, self.predict(x))

CatBoostModel

CatBoost模型封装类。

提供统一的CatBoost模型训练、预测、保存和加载接口。 支持模型校准、特征重要性获取等功能。

参数:

名称 类型 描述 默认
params dict

CatBoost模型参数字典

必需
model CatBoostClassifier

预加载的模型实例

None

示例:

>>> cat_clf = CatBoostModel(params)
>>> cat_clf.fit(x_train, y_train, x_val, y_val)
>>> preds = cat_clf.predict(x_test)
源代码位于: Modeling_Tool/Model/GBM_Tool.py
class CatBoostModel:
    """
    Wrapper class around a CatBoost model.

    Offers a uniform train / predict / save / load interface, plus
    probability calibration, feature-importance lookup and a few
    evaluation helpers.

    Parameters
    ----------
    params : dict
        CatBoost parameter dictionary.
    model : CatBoostClassifier, optional
        Pre-loaded model instance.

    Examples
    --------
    >>> cat_clf = CatBoostModel(params)
    >>> cat_clf.fit(x_train, y_train, x_val, y_val)
    >>> preds = cat_clf.predict(x_test)
    """

    def __init__(self, params, model=None):
        """
        Initialise the CatBoost wrapper.

        Parameters
        ----------
        params : dict
            CatBoost parameter dictionary.
        model : CatBoostClassifier, optional
            Pre-loaded model instance.
        """
        # Called for its side effect only (presumably a catboost
        # availability check -- confirm); the result is not used.
        _get_catboost()
        self.params = params
        self.model = model
        self.feature_names_ = None

    def fit(self, x, y, valx, valy, sample_weight=None):
        """Train the CatBoost model.

        Parameters
        ----------
        x : array-like or pd.DataFrame
            Training features.
        y : array-like
            Training labels.
        valx : array-like or pd.DataFrame
            Validation features.
        valy : array-like
            Validation labels.
        sample_weight : array-like, optional
            Sample weights.

        Returns
        -------
        self
        """
        self.model = catboost_model(
            x=x, y=y, valx=valx, valy=valy,
            params_dict=self.params,
            sample_weight=sample_weight,
        )
        # Column names are recorded only for inputs exposing ``.columns``.
        if hasattr(x, 'columns'):
            self.feature_names_ = list(x.columns)
        return self

    def predict(self, x):
        """Predict probabilities for the given samples.

        Parameters
        ----------
        x : array-like or pd.DataFrame
            Features to score.

        Returns
        -------
        np.ndarray
            Column index 1 of ``self.model.predict_proba(x)``.
        """
        return self.model.predict_proba(x)[:, 1]

    def get_feature_importance(self, importance_type='gain'):
        """Return the feature importances of the wrapped model.

        Parameters
        ----------
        importance_type : str, default 'gain'
            Importance type (CatBoost uses PredictionValuesChange).
            NOTE(review): this value is not forwarded to
            ``catboost_varimp``, so it currently has no effect here.

        Returns
        -------
        pd.DataFrame
            DataFrame with ``feature`` and ``importance`` columns.
        """
        return catboost_varimp(self.model)

    def save(self, path):
        """Save the wrapped model.

        Parameters
        ----------
        path : str
            Destination path, handed to ``save_model``.
        """
        save_model(self.model, path)

    def load(self, path):
        """Load a model and store it on this wrapper.

        Parameters
        ----------
        path : str
            Path of the model file, handed to ``load_model``.

        Returns
        -------
        self
        """
        self.model = load_model(path)
        return self

    def calibrate(self, x, y, method='sigmoid', cv='prefit'):
        """Calibrate the wrapped model's predicted probabilities.

        ``self.model`` is replaced by the fitted ``CalibratedClassifierCV``
        only after the fit succeeds, so a failing calibration leaves the
        previously stored model untouched.

        Parameters
        ----------
        x : array-like
            Calibration features.
        y : array-like
            Calibration labels.
        method : str, default 'sigmoid'
            Calibration method, forwarded as ``method``.
        cv : str or int, default 'prefit'
            Cross-validation setting, forwarded as ``cv``.

        Returns
        -------
        self
        """
        # Fit on a local first: assigning before fitting would leave an
        # unfitted calibrator in self.model if fit() raised.
        calibrator = CalibratedClassifierCV(self.model, method=method, cv=cv)
        calibrator.fit(x, y)
        self.model = calibrator
        return self

    def calibration_curve(self, x, y, n_bins=10):
        """Compute calibration-curve data for the wrapped model.

        Parameters
        ----------
        x : array-like
            Features.
        y : array-like
            Labels.
        n_bins : int, default 10
            Number of bins.

        Returns
        -------
        tuple
            Result of the module-level ``calibration_curve`` call.
        """
        y_prob = self.predict(x)
        return calibration_curve(y, y_prob, n_bins=n_bins)

    def brier_score(self, x, y):
        """Compute the Brier score of the model's predictions on ``x``.

        Parameters
        ----------
        x : array-like
            Features.
        y : array-like
            Labels.

        Returns
        -------
        float
            Result of ``brier_score_loss``.
        """
        y_prob = self.predict(x)
        return brier_score_loss(y, y_prob)

    def roc_auc(self, x, y):
        """Compute the ROC AUC of the model's predictions on ``x``.

        Parameters
        ----------
        x : array-like
            Features.
        y : array-like
            Labels.

        Returns
        -------
        float
            Result of ``roc_auc_score``.
        """
        y_prob = self.predict(x)
        return roc_auc_score(y, y_prob)

fit

fit(x, y, valx, valy, sample_weight=None)

训练CatBoost模型。

参数:

名称 类型 描述 默认
x array-like or DataFrame

训练集特征

必需
y array-like

训练集标签

必需
valx array-like or DataFrame

验证集特征

必需
valy array-like

验证集标签

必需
sample_weight array-like

样本权重

None

返回:

类型 描述
self
源代码位于: Modeling_Tool/Model/GBM_Tool.py
def fit(self, x, y, valx, valy, sample_weight=None):
    """Train the CatBoost model.

    Parameters
    ----------
    x : array-like or pd.DataFrame
        Training features.
    y : array-like
        Training labels.
    valx : array-like or pd.DataFrame
        Validation features.
    valy : array-like
        Validation labels.
    sample_weight : array-like, optional
        Sample weights.

    Returns
    -------
    self
    """
    fitted = catboost_model(
        x=x,
        y=y,
        valx=valx,
        valy=valy,
        params_dict=self.params,
        sample_weight=sample_weight,
    )
    self.model = fitted
    # Column names are recorded only for inputs exposing ``.columns``.
    if hasattr(x, 'columns'):
        self.feature_names_ = list(x.columns)
    return self

predict

predict(x)

预测样本的概率。

参数:

名称 类型 描述 默认
x array - like or DataFrame

预测特征

必需

返回:

类型 描述
ndarray

预测概率

源代码位于: Modeling_Tool/Model/GBM_Tool.py
def predict(self, x):
    """Predict probabilities for the given samples.

    Parameters
    ----------
    x : array-like or pd.DataFrame
        Features to score.

    Returns
    -------
    np.ndarray
        Column index 1 of ``self.model.predict_proba(x)``.
    """
    proba = self.model.predict_proba(x)
    return proba[:, 1]

get_feature_importance

get_feature_importance(importance_type='gain')

获取特征重要性。

参数:

名称 类型 描述 默认
importance_type str

特征重要性类型(CatBoost 使用 PredictionValuesChange)

'gain'

返回:

类型 描述
DataFrame

包含 feature 和 importance 列的DataFrame

源代码位于: Modeling_Tool/Model/GBM_Tool.py
def get_feature_importance(self, importance_type='gain'):
    """Return the feature importances of the wrapped model.

    Parameters
    ----------
    importance_type : str, default 'gain'
        Importance type (CatBoost uses PredictionValuesChange).
        NOTE(review): this value is not forwarded to ``catboost_varimp``,
        so it currently has no effect here.

    Returns
    -------
    pd.DataFrame
        DataFrame with ``feature`` and ``importance`` columns.
    """
    fitted = self.model
    return catboost_varimp(fitted)

save

save(path)

保存模型。

参数:

名称 类型 描述 默认
path str

模型保存路径

必需
源代码位于: Modeling_Tool/Model/GBM_Tool.py
def save(self, path):
    """Save the wrapped model.

    Parameters
    ----------
    path : str
        Destination path, handed to ``save_model``.
    """
    fitted = self.model
    save_model(fitted, path)

load

load(path)

加载模型。

参数:

名称 类型 描述 默认
path str

模型文件路径

必需

返回:

类型 描述
self
源代码位于: Modeling_Tool/Model/GBM_Tool.py
def load(self, path):
    """Load a model and store it on this wrapper.

    Parameters
    ----------
    path : str
        Path of the model file, handed to ``load_model``.

    Returns
    -------
    self
    """
    loaded = load_model(path)
    self.model = loaded
    return self

calibrate

calibrate(x, y, method='sigmoid', cv='prefit')

模型概率校准。

参数:

名称 类型 描述 默认
x array - like

校准特征

必需
y array - like

校准标签

必需
method str

校准方法

'sigmoid'
cv str or int

交叉验证方式

'prefit'

返回:

类型 描述
self
源代码位于: Modeling_Tool/Model/GBM_Tool.py
def calibrate(self, x, y, method='sigmoid', cv='prefit'):
    """Calibrate the wrapped model's predicted probabilities.

    The current ``self.model`` is wrapped in ``CalibratedClassifierCV``
    and the wrapper is fitted on ``(x, y)``.  ``self.model`` is replaced
    only after the fit succeeds, so a failing calibration leaves the
    previously stored model untouched.

    Parameters
    ----------
    x : array-like
        Calibration features.
    y : array-like
        Calibration labels.
    method : str, default 'sigmoid'
        Calibration method, forwarded as ``method``.
    cv : str or int, default 'prefit'
        Cross-validation setting, forwarded as ``cv``.

    Returns
    -------
    self
    """
    # Fit on a local first: assigning before fitting would leave an
    # unfitted calibrator in self.model if fit() raised.
    calibrator = CalibratedClassifierCV(self.model, method=method, cv=cv)
    calibrator.fit(x, y)
    self.model = calibrator
    return self

calibration_curve

calibration_curve(x, y, n_bins=10)

获取校准曲线数据。

参数:

名称 类型 描述 默认
x array - like

特征

必需
y array - like

标签

必需
n_bins int

分箱数

10

返回:

类型 描述
tuple
源代码位于: Modeling_Tool/Model/GBM_Tool.py
def calibration_curve(self, x, y, n_bins=10):
    """Compute calibration-curve data for the wrapped model.

    Scores ``x`` with ``self.predict`` and passes the labels and scores
    on to ``calibration_curve``.

    Parameters
    ----------
    x : array-like
        Features.
    y : array-like
        Labels.
    n_bins : int, default 10
        Number of bins.

    Returns
    -------
    tuple
    """
    return calibration_curve(y, self.predict(x), n_bins=n_bins)

brier_score

brier_score(x, y)

计算Brier分数。

参数:

名称 类型 描述 默认
x array - like
必需
y array - like
必需

返回:

类型 描述
float
源代码位于: Modeling_Tool/Model/GBM_Tool.py
def brier_score(self, x, y):
    """Compute the Brier score of the model's predictions on ``x``.

    Parameters
    ----------
    x : array-like
        Features.
    y : array-like
        Labels.

    Returns
    -------
    float
        Result of ``brier_score_loss``.
    """
    return brier_score_loss(y, self.predict(x))

roc_auc

roc_auc(x, y)

计算ROC AUC。

参数:

名称 类型 描述 默认
x array - like
必需
y array - like
必需

返回:

类型 描述
float
源代码位于: Modeling_Tool/Model/GBM_Tool.py
def roc_auc(self, x, y):
    """Compute the ROC AUC of the model's predictions on ``x``.

    Parameters
    ----------
    x : array-like
        Features.
    y : array-like
        Labels.

    Returns
    -------
    float
        Result of ``roc_auc_score``.
    """
    return roc_auc_score(y, self.predict(x))

GradientBoostingModel

统一梯度提升模型封装类。

支持LightGBM、XGBoost和CatBoost三种框架的统一接口。 通过model_type参数切换框架,其他接口保持一致。

参数:

名称 类型 描述 默认
model_type str

模型类型,'lgb'、'xgb' 或 'cat'('catboost' 别名)

必需
params dict

模型参数字典

必需

示例:

>>> model = GradientBoostingModel('lgb', params)
>>> model.fit(x_train, y_train, x_val, y_val)
>>> preds = model.predict(x_test)

增量学习(warm-start)

>>> base_margin = init_model.get_base_margin(x_train)
>>> new_model = GradientBoostingModel('xgb', params)
>>> new_model.fit(x_train, y_train, x_val, y_val, init_score=base_margin)
>>> proba = new_model.predict_with_base_margin(
...     x_score, init_model.get_base_margin(x_score))

适配已训练好的裸估计器(如历史直接 pickle 的 XGBClassifier)

>>> init_model = GradientBoostingModel.from_fitted(load_model(path))
>>> init_model.get_base_margin(x_train)
源代码位于: Modeling_Tool/Model/GBM_Tool.py
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
class GradientBoostingModel:
    """
    Unified wrapper around gradient-boosting models.

    Offers one interface over the LightGBM, XGBoost and CatBoost backends.
    The backend is selected with ``model_type``; every other method keeps the
    same signature regardless of the backend.

    Parameters
    ----------
    model_type : str
        Backend identifier: 'lgb', 'xgb' or 'cat' ('catboost' is an alias).
    params : dict
        Model parameter dictionary, handed to the backend wrapper.

    Examples
    --------
    >>> model = GradientBoostingModel('lgb', params)
    >>> model.fit(x_train, y_train, x_val, y_val)
    >>> preds = model.predict(x_test)

    # Incremental learning (warm start)
    >>> base_margin = init_model.get_base_margin(x_train)
    >>> new_model = GradientBoostingModel('xgb', params)
    >>> new_model.fit(x_train, y_train, x_val, y_val, init_score=base_margin)
    >>> proba = new_model.predict_with_base_margin(
    ...     x_score, init_model.get_base_margin(x_score))

    # Adapt an already-fitted bare estimator (e.g. a legacy pickled XGBClassifier)
    >>> init_model = GradientBoostingModel.from_fitted(load_model(path))
    >>> init_model.get_base_margin(x_train)
    """

    def __init__(self, model_type, params):
        """
        Create the wrapper and instantiate the matching backend wrapper.

        Parameters
        ----------
        model_type : str
            Backend identifier: 'lgb', 'xgb' or 'cat' ('catboost' is an alias).
        params : dict
            Model parameter dictionary.

        Raises
        ------
        ValueError
            If ``model_type`` is not one of the supported identifiers.
        """
        # Normalise the alias before validating.
        if model_type == 'catboost':
            model_type = 'cat'
        if model_type not in ('lgb', 'xgb', 'cat'):
            raise ValueError(
                f"model_type must be 'lgb', 'xgb', or 'cat', got: {model_type!r}"
            )
        self.model_type = model_type
        self.params = params
        if model_type == 'lgb':
            self._model = LightGBMModel(params)
        elif model_type == 'xgb':
            self._model = XGBoostModel(params)
        else:
            self._model = CatBoostModel(params)

    @staticmethod
    def _detect_model_type(estimator):
        """Infer 'lgb', 'xgb' or 'cat' from a fitted estimator's module / class name.

        The check is a case-insensitive substring match on
        ``"<module>.<class name>"``; 'catboost' is tested first, then
        'xgboost' / 'xgb', then 'lightgbm' / 'lgb'.

        Parameters
        ----------
        estimator : object
            A fitted XGBClassifier / LGBMClassifier / CatBoostClassifier or similar.

        Returns
        -------
        str
            'lgb', 'xgb' or 'cat'.

        Raises
        ------
        ValueError
            If none of the known substrings occurs in the type's name.
        """
        cls = type(estimator)
        tag = f"{cls.__module__}.{cls.__name__}".lower()
        if 'catboost' in tag:
            return 'cat'
        if 'xgboost' in tag or 'xgb' in tag:
            return 'xgb'
        if 'lightgbm' in tag or 'lgb' in tag:
            return 'lgb'
        raise ValueError(
            f"cannot infer model_type from {cls.__name__!r}; "
            "pass model_type='lgb', 'xgb', or 'cat' explicitly"
        )

    @classmethod
    def from_fitted(cls, model, model_type=None, params=None):
        """Build a GradientBoostingModel around an already-fitted estimator or wrapper.

        Intended for models that were historically saved as bare sklearn-style
        estimators (``XGBClassifier`` / ``LGBMClassifier`` /
        ``CatBoostClassifier``), so they can use the unified
        :meth:`get_base_margin` / :meth:`predict_with_base_margin` interface
        without retraining.

        Parameters
        ----------
        model : object
            A fitted ``XGBClassifier`` / ``LGBMClassifier`` /
            ``CatBoostClassifier``, or a ``LightGBMModel`` / ``XGBoostModel`` /
            ``CatBoostModel`` / ``GradientBoostingModel`` wrapper.
        model_type : {'lgb', 'xgb', 'cat'}, optional
            Backend identifier; inferred from the estimator's type when omitted.
        params : dict, optional
            Parameter dictionary; when omitted it is read from the estimator's
            ``get_params()`` if that method exists, otherwise ``{}`` is used.

        Returns
        -------
        GradientBoostingModel
            ``model`` itself when it is already an instance of this class,
            otherwise a new wrapper around the estimator.

        Raises
        ------
        ValueError
            If an unfitted / empty model is passed, or if a
            GradientBoostingModel instance lacks its ``_model`` attribute.
        """
        if isinstance(model, cls):
            try:
                # Bypass __getattr__ so a missing _model is detected reliably.
                object.__getattribute__(model, '_model')
                return model
            except AttributeError:
                raise ValueError(
                    "Loaded GradientBoostingModel is missing _model (Cython pickle bug "
                    "in a previous package version). Please retrain and re-save the model."
                )
        # Unwrap LightGBMModel / XGBoostModel / CatBoostModel (they keep the bare estimator in .model)
        estimator = getattr(model, 'model', model)
        if estimator is None:
            raise ValueError("from_fitted received an unfitted / empty model")
        mt = model_type or cls._detect_model_type(estimator)
        if mt == 'catboost':
            mt = 'cat'
        if params is None:
            params = estimator.get_params() if hasattr(estimator, 'get_params') else {}
        obj = cls(mt, params)
        obj._model.model = estimator
        # Helpers are defined elsewhere in this module; presumably they read /
        # backfill the estimator's feature names -- confirm against their definitions.
        feat = _extract_estimator_feature_names(estimator)
        if feat is not None:
            obj._model.feature_names_ = feat
            _ensure_feature_names_in(estimator, feat)
        return obj

    def __getattr__(self, name):
        """Delegate unknown attributes to the underlying fitted estimator.

        Makes the wrapper a superset of the raw LGBM/XGB/CatBoost estimator
        (``get_params`` / ``feature_names_in_`` / ``predict_proba`` and so on
        pass straight through). Only invoked when normal attribute lookup
        fails; dunder names and a missing ``_model`` (e.g. midway through
        unpickling) raise AttributeError so that pickle / copy are not
        disturbed.
        """
        if name.startswith('__') and name.endswith('__'):
            raise AttributeError(name)
        if name == '_model':
            raise AttributeError(name)
        try:
            # object.__getattribute__ avoids re-entering this method.
            model = object.__getattribute__(self, '_model')
        except AttributeError:
            raise AttributeError(name)
        inner = getattr(model, 'model', None)
        if inner is not None and hasattr(inner, name):
            return getattr(inner, name)
        # Fallback for feature_names_in_: use the wrapper's recorded names,
        # then whatever can be extracted from the estimator.
        if name == 'feature_names_in_':
            feat = getattr(model, 'feature_names_', None)
            if feat is None and inner is not None:
                feat = _extract_estimator_feature_names(inner)
            if feat is not None:
                return np.asarray(feat, dtype=object)
        raise AttributeError(name)

    def __getstate__(self):
        """Return the pickle state as a flat dict.

        The backend wrapper itself is not stored; only its ``params``,
        ``model`` and ``feature_names_`` are, alongside this object's
        ``model_type`` and ``params``.
        """
        return {
            'model_type': self.model_type,
            'params': self.params,
            '_model_params': self._model.params,
            '_model_model': self._model.model,
            '_model_feature_names_': getattr(self._model, 'feature_names_', None),
        }

    def __setstate__(self, state):
        """Restore from the dict produced by :meth:`__getstate__`.

        A fresh backend wrapper is created according to ``model_type`` and the
        stored estimator and feature names are re-attached to it.
        """
        self.model_type = state['model_type']
        self.params = state['params']
        mt = self.model_type
        # Fall back to the outer params when the state has no '_model_params'.
        mp = state.get('_model_params', self.params)
        if mt == 'lgb':
            self._model = LightGBMModel(mp)
        elif mt == 'xgb':
            self._model = XGBoostModel(mp)
        else:
            self._model = CatBoostModel(mp)
        self._model.model = state.get('_model_model')
        self._model.feature_names_ = state.get('_model_feature_names_')

    @staticmethod
    def _sigmoid(z):
        """Numerically stable sigmoid (log-odds -> probability).

        The input is clipped to [-709, 709] before exponentiation so that
        ``np.exp`` does not overflow.
        """
        z = np.clip(np.asarray(z, dtype=float), -709, 709)
        return 1.0 / (1.0 + np.exp(-z))

    def fit(self, x, y, valx, valy, init_score=None, sample_weight=None, eval_sample_weight=None, sample_weight_eval_set=None, **kwargs):
        """Train the model (supports incremental learning / warm start).

        When ``init_score`` is given it is used as a log-odds offset while
        training on the new data: the LightGBM wrapper receives it as
        ``init_score`` and the XGBoost wrapper as ``base_margin``; this method
        exposes both uniformly as ``init_score``. CatBoost does not support
        ``init_score`` here.

        Parameters
        ----------
        x : array-like or pd.DataFrame
            Training features.
        y : array-like
            Training labels.
        valx : array-like or pd.DataFrame
            Validation features.
        valy : array-like
            Validation labels.
        init_score : array-like, optional
            Initial log-odds offset (starting point for incremental learning),
            typically produced by a base model's :meth:`get_base_margin`.
            ``None`` means ordinary training from scratch.
        sample_weight : array-like, optional
            Training sample weights; forwarded to every backend.
        eval_sample_weight : array-like, optional
            Validation sample weights. Forwarded as-is for 'lgb'; for 'xgb' it
            is wrapped into a one-element list and used as
            ``sample_weight_eval_set`` when that argument is ``None``; not
            forwarded for 'cat'.
        sample_weight_eval_set : list, optional
            Validation weight list; only forwarded for 'xgb'.
        **kwargs
            Extra keyword arguments forwarded unchanged to the backend
            wrapper's ``fit``.

        Returns
        -------
        self

        Raises
        ------
        NotImplementedError
            If ``init_score`` is given for the CatBoost backend.

        Notes
        -----
        In line with the existing production flow, the offset is applied to the
        training set only; no offset is injected into the validation set, so
        the early-stopping eval metric is computed in the space without the
        offset. For strict consistency, lgb's ``eval_init_score`` / xgb's
        ``base_margin_eval_set`` could be passed through later.
        """
        if self.model_type == 'cat':
            if init_score is not None:
                raise NotImplementedError(
                    "CatBoost does not support init_score in GradientBoostingModel.fit"
                )
            self._model.fit(x, y, valx, valy, sample_weight=sample_weight, **kwargs)
        elif self.model_type == 'lgb':
            self._model.fit(
                x, y, valx, valy,
                init_score=init_score,
                sample_weight=sample_weight,
                eval_sample_weight=eval_sample_weight,
                **kwargs,
            )
        else:
            # xgb: an explicit sample_weight_eval_set wins over eval_sample_weight.
            if sample_weight_eval_set is None and eval_sample_weight is not None:
                sample_weight_eval_set = [eval_sample_weight]
            self._model.fit(
                x, y, valx, valy,
                base_margin=init_score,
                sample_weight=sample_weight,
                sample_weight_eval_set=sample_weight_eval_set,
                **kwargs,
            )
        return self

    def get_base_margin(self, x):
        """Return this model's raw log-odds for ``x`` (base margin / init score).

        Fetches the score before the sigmoid in a backend-specific way:

        - XGBoost: ``predict(x, output_margin=True)``
        - LightGBM: ``predict(x, raw_score=True)``
        - CatBoost: ``predict(x, prediction_type='RawFormulaVal')``

        The result can serve as ``init_score`` for the next incremental
        model's :meth:`fit`, or be fed to :meth:`predict_with_base_margin`.

        Parameters
        ----------
        x : array-like or pd.DataFrame
            Features to score.

        Returns
        -------
        np.ndarray
            The estimator's output flattened to one dimension.

        Raises
        ------
        RuntimeError
            If the model has not been trained yet (before ``fit``).
        """
        est = self._model.model
        if est is None:
            raise RuntimeError("model is not fitted yet; call fit() first")
        if self.model_type == 'lgb':
            margin = est.predict(x, raw_score=True)
        elif self.model_type == 'cat':
            margin = est.predict(x, prediction_type='RawFormulaVal')
        else:
            margin = est.predict(x, output_margin=True)
        return np.asarray(margin).ravel()

    def predict_with_base_margin(self, x, base_margin, return_prob=True):
        """Fused prediction: ``sigmoid(base_margin + this model's raw score)``.

        Adds a base model's log-odds (``base_margin``, usually from
        ``init_model.get_base_margin(x)``) to this (incremental) model's own
        raw score in log-odds space, then applies the sigmoid. According to
        the original author, this manual fusion is the only approach that
        behaves the same for lgb and xgb, because LightGBM does not accept an
        init_score at prediction time.

        Parameters
        ----------
        x : array-like or pd.DataFrame
            Features to predict on.
        base_margin : array-like
            Base model's log-odds offset; it is flattened and should have one
            entry per sample in ``x``.
        return_prob : bool, default True
            ``True`` returns probabilities (after the sigmoid); ``False``
            returns the fused raw log-odds.

        Returns
        -------
        np.ndarray
            One-dimensional array; values lie in ``[0, 1]`` when
            ``return_prob=True``.
        """
        combined = np.asarray(base_margin).ravel() + self.get_base_margin(x)
        return self._sigmoid(combined) if return_prob else combined

    def predict(self, x):
        """Predict for the given samples via the backend wrapper's ``predict``.

        The original documentation describes the result as probabilities.

        Parameters
        ----------
        x : array-like or pd.DataFrame

        Returns
        -------
        np.ndarray
        """
        return self._model.predict(x)

    def get_feature_importance(self, importance_type='gain'):
        """Return feature importances from the backend wrapper.

        Parameters
        ----------
        importance_type : str, default 'gain'
            Forwarded to the backend wrapper.

        Returns
        -------
        pd.DataFrame
        """
        return self._model.get_feature_importance(importance_type=importance_type)

    def save(self, path):
        """Save the model by delegating to the backend wrapper's ``save``."""
        self._model.save(path)

    def load(self, path):
        """Load the model via the backend wrapper's ``load`` and return ``self``."""
        self._model.load(path)
        return self

    def calibrate(self, x, y, method='sigmoid', cv='prefit'):
        """Calibrate model probabilities via the backend wrapper; returns ``self``."""
        self._model.calibrate(x, y, method=method, cv=cv)
        return self

    def brier_score(self, x, y):
        """Compute the Brier score via the backend wrapper."""
        return self._model.brier_score(x, y)

    def roc_auc(self, x, y):
        """Compute the ROC AUC via the backend wrapper."""
        return self._model.roc_auc(x, y)

from_fitted classmethod

from_fitted(model, model_type=None, params=None)

用一个【已训练好】的估计器(或封装)构造 GradientBoostingModel。

用于适配历史上直接以 sklearn 估计器(XGBClassifier / LGBMClassifier / CatBoostClassifier)形式保存的模型,使其无需重训即可使用 :meth:get_base_margin / :meth:predict_with_base_margin 等统一接口。

参数:

名称 类型 描述 默认
model object

已 fit 的 XGBClassifier / LGBMClassifier / CatBoostClassifier,或 LightGBMModel / XGBoostModel / CatBoostModel / GradientBoostingModel 封装。

必需
model_type (lgb, xgb, cat)

不传则按估计器类型自动推断。

None
params dict

参数字典;不传则尽量从估计器的 get_params() 读取。

None

返回:

类型 描述
GradientBoostingModel

引发:

类型 描述
ValueError

传入未 fit / 空模型时。

源代码位于: Modeling_Tool/Model/GBM_Tool.py
@classmethod
def from_fitted(cls, model, model_type=None, params=None):
    """Wrap an already-fitted estimator (or wrapper) in a GradientBoostingModel.

    Lets models that were saved as bare sklearn-style estimators
    (``XGBClassifier`` / ``LGBMClassifier`` / ``CatBoostClassifier``) use the
    unified :meth:`get_base_margin` / :meth:`predict_with_base_margin`
    interface without being retrained.

    Parameters
    ----------
    model : object
        A fitted ``XGBClassifier`` / ``LGBMClassifier`` /
        ``CatBoostClassifier``, or a ``LightGBMModel`` / ``XGBoostModel`` /
        ``CatBoostModel`` / ``GradientBoostingModel`` wrapper.
    model_type : {'lgb', 'xgb', 'cat'}, optional
        Backend identifier; inferred from the estimator's type when omitted.
    params : dict, optional
        Parameter dictionary; when omitted it is taken from the estimator's
        ``get_params()`` if available, otherwise ``{}``.

    Returns
    -------
    GradientBoostingModel
        ``model`` itself if it already is an instance of this class,
        otherwise a new wrapper around the estimator.

    Raises
    ------
    ValueError
        If an unfitted / empty model is passed, or if a GradientBoostingModel
        instance has no ``_model`` attribute.
    """
    if isinstance(model, cls):
        # Probe with object.__getattribute__ so attribute delegation cannot
        # mask a genuinely missing _model.
        try:
            object.__getattribute__(model, '_model')
        except AttributeError:
            raise ValueError(
                "Loaded GradientBoostingModel is missing _model (Cython pickle bug "
                "in a previous package version). Please retrain and re-save the model."
            )
        return model

    # The backend wrappers keep the bare estimator in ``.model``; anything
    # without that attribute is treated as the estimator itself.
    estimator = getattr(model, 'model', model)
    if estimator is None:
        raise ValueError("from_fitted received an unfitted / empty model")

    resolved_type = model_type if model_type else cls._detect_model_type(estimator)
    if resolved_type == 'catboost':
        resolved_type = 'cat'

    if params is None:
        if hasattr(estimator, 'get_params'):
            params = estimator.get_params()
        else:
            params = {}

    wrapped = cls(resolved_type, params)
    wrapped._model.model = estimator

    # Both helpers live elsewhere in the module; presumably they read and
    # backfill feature names on the estimator -- confirm against their source.
    feature_names = _extract_estimator_feature_names(estimator)
    if feature_names is not None:
        wrapped._model.feature_names_ = feature_names
        _ensure_feature_names_in(estimator, feature_names)
    return wrapped

fit

fit(x, y, valx, valy, init_score=None, sample_weight=None, eval_sample_weight=None, sample_weight_eval_set=None, **kwargs)

训练模型(支持增量学习 warm-start)。

当传入 init_score 时,以其作为 log-odds 偏移在新数据上继续训练: LightGBM 走 init_score,XGBoost 走 base_margin(两者语义一致, 本方法统一对外暴露为 init_score)。CatBoost 不支持 init_score

参数:

名称 类型 描述 默认
x array - like or DataFrame

训练集特征

必需
y array - like

训练集标签

必需
valx array - like or DataFrame

验证集特征

必需
valy array - like

验证集标签

必需
init_score array - like

初始 log-odds 偏移(增量学习起点)。一般由基准模型的 :meth:get_base_margin 产生。None 时即普通从头训练。

None
**kwargs

其余参数透传给底层模型(如 lgb 的 wgt、xgb 的 sample_weight / sample_weight_eval_set、cat 的 sample_weight)。

{}

返回:

类型 描述
self

引发:

类型 描述
NotImplementedError

CatBoost 不支持 init_score 增量学习。

Notes

与既有生产流程一致,偏移仅作用于训练集;验证集未注入偏移,因此早停 的 eval 指标是在"未加偏移"的空间上评估的。如需严格一致,可后续透传 lgb 的 eval_init_score / xgb 的 base_margin_eval_set

源代码位于: Modeling_Tool/Model/GBM_Tool.py
def fit(self, x, y, valx, valy, init_score=None, sample_weight=None, eval_sample_weight=None, sample_weight_eval_set=None, **kwargs):
    """Train the wrapped model, optionally warm-started from a log-odds offset.

    ``init_score`` is the unified name of the offset: the LightGBM wrapper
    receives it as ``init_score``, the XGBoost wrapper as ``base_margin``.
    The CatBoost backend rejects it.

    Parameters
    ----------
    x : array-like or pd.DataFrame
        Training features.
    y : array-like
        Training labels.
    valx : array-like or pd.DataFrame
        Validation features.
    valy : array-like
        Validation labels.
    init_score : array-like, optional
        Initial log-odds offset, typically a base model's
        :meth:`get_base_margin` output. ``None`` trains from scratch.
    sample_weight : array-like, optional
        Training sample weights; forwarded to every backend.
    eval_sample_weight : array-like, optional
        Validation sample weights. Forwarded as-is for 'lgb'; for 'xgb' it is
        wrapped in a one-element list and used as ``sample_weight_eval_set``
        when that argument is ``None``; not forwarded for 'cat'.
    sample_weight_eval_set : list, optional
        Validation weight list; only forwarded for 'xgb'.
    **kwargs
        Forwarded unchanged to the backend wrapper's ``fit``.

    Returns
    -------
    self

    Raises
    ------
    NotImplementedError
        If ``init_score`` is supplied for the CatBoost backend.

    Notes
    -----
    The offset is applied to the training set only; the validation set gets
    none, so the early-stopping metric is evaluated without the offset.
    """
    kind = self.model_type

    if kind == 'cat':
        if init_score is not None:
            raise NotImplementedError(
                "CatBoost does not support init_score in GradientBoostingModel.fit"
            )
        self._model.fit(x, y, valx, valy, sample_weight=sample_weight, **kwargs)
        return self

    if kind == 'lgb':
        self._model.fit(
            x, y, valx, valy,
            init_score=init_score,
            sample_weight=sample_weight,
            eval_sample_weight=eval_sample_weight,
            **kwargs,
        )
        return self

    # Remaining backend is xgb: an explicit weight list takes precedence,
    # otherwise a single validation weight vector is wrapped into a list.
    eval_weights = sample_weight_eval_set
    if eval_weights is None and eval_sample_weight is not None:
        eval_weights = [eval_sample_weight]
    self._model.fit(
        x, y, valx, valy,
        base_margin=init_score,
        sample_weight=sample_weight,
        sample_weight_eval_set=eval_weights,
        **kwargs,
    )
    return self

get_base_margin

get_base_margin(x)

返回本模型对 x 的原始 log-odds(base margin / init score)。

统一兼容三种框架取"未经 sigmoid 的原始分数":

  • XGBoost: predict(x, output_margin=True)
  • LightGBM: predict(x, raw_score=True)
  • CatBoost: predict(x, prediction_type='RawFormulaVal')

该结果可作为下一个增量模型 fit 的 init_score,或喂给 predict_with_base_margin 做融合预测。

参数:

名称 类型 描述 默认
x array - like or DataFrame

待计算的特征

必需

返回:

类型 描述
ndarray

一维 log-odds 数组,形状 (n_samples,)

引发:

类型 描述
RuntimeError

当模型尚未训练(fit 之前)时

源代码位于: Modeling_Tool/Model/GBM_Tool.py
def get_base_margin(self, x):
    """Return this model's raw (pre-sigmoid) score for ``x``.

    The underlying estimator's ``predict`` is called with a backend-specific
    flag that requests the raw score:

    - LightGBM: ``raw_score=True``
    - CatBoost: ``prediction_type='RawFormulaVal'``
    - otherwise (XGBoost): ``output_margin=True``

    The result can be used as ``init_score`` for the next incremental model's
    :meth:`fit` or passed to :meth:`predict_with_base_margin`.

    Parameters
    ----------
    x : array-like or pd.DataFrame
        Features to score.

    Returns
    -------
    np.ndarray
        The estimator's output flattened to one dimension.

    Raises
    ------
    RuntimeError
        If no estimator is attached yet (before ``fit``).
    """
    estimator = self._model.model
    if estimator is None:
        raise RuntimeError("model is not fitted yet; call fit() first")

    # Keyword that makes each backend return the raw score; any backend not
    # listed here falls back to the XGBoost flag.
    raw_flags = {
        'lgb': {'raw_score': True},
        'cat': {'prediction_type': 'RawFormulaVal'},
    }
    flag = raw_flags.get(self.model_type, {'output_margin': True})
    raw_score = estimator.predict(x, **flag)
    return np.asarray(raw_score).ravel()

predict_with_base_margin

predict_with_base_margin(x, base_margin, return_prob=True)

融合预测:sigmoid(base_margin + 本模型 raw score)

把一个基准模型的 log-odds(base_margin,通常来自 init_model.get_base_margin(x))与本(增量)模型自身的 raw score 在 log-odds 空间相加,再做 sigmoid。这种手动融合是唯一对 lgb 与 xgb 行为一致的方式——LightGBM 在预测期并不支持注入 init_score。

参数:

名称 类型 描述 默认
x array - like or DataFrame

待预测的特征

必需
base_margin array - like

基准模型的 log-odds 偏移,形状须与 x 的样本数一致

必需
return_prob bool

True 返回概率(sigmoid 后);False 返回融合后的原始 log-odds

True

返回:

类型 描述
ndarray

一维数组,return_prob=True 时取值于 [0, 1]

源代码位于: Modeling_Tool/Model/GBM_Tool.py
def predict_with_base_margin(self, x, base_margin, return_prob=True):
    """Fuse a base model's log-odds with this model's raw score.

    Computes ``base_margin + self.get_base_margin(x)`` and, when
    ``return_prob`` is true, passes the sum through ``self._sigmoid``.

    Parameters
    ----------
    x : array-like or pd.DataFrame
        Features to predict on.
    base_margin : array-like
        Base model's log-odds offset (usually
        ``init_model.get_base_margin(x)``); it is flattened before the sum and
        should have one entry per sample in ``x``.
    return_prob : bool, default True
        ``True`` returns the sigmoid of the fused score; ``False`` returns the
        fused raw log-odds.

    Returns
    -------
    np.ndarray
        One-dimensional array of fused scores or probabilities.
    """
    offset = np.asarray(base_margin).ravel()
    fused = offset + self.get_base_margin(x)
    if return_prob:
        return self._sigmoid(fused)
    return fused

predict

predict(x)

预测样本的概率。

参数:

名称 类型 描述 默认
x array - like or DataFrame
必需

返回:

类型 描述
ndarray
源代码位于: Modeling_Tool/Model/GBM_Tool.py
def predict(self, x):
    """Return the backend wrapper's prediction for ``x``.

    Parameters
    ----------
    x : array-like or pd.DataFrame
        Samples to predict on.

    Returns
    -------
    np.ndarray
        Whatever the backend wrapper's ``predict`` returns (documented
        upstream as probabilities).
    """
    backend = self._model
    return backend.predict(x)

get_feature_importance

get_feature_importance(importance_type='gain')

获取特征重要性。

返回:

类型 描述
DataFrame
源代码位于: Modeling_Tool/Model/GBM_Tool.py
def get_feature_importance(self, importance_type='gain'):
    """Fetch feature importances from the backend wrapper.

    Parameters
    ----------
    importance_type : str, default 'gain'
        Forwarded to the backend wrapper as a keyword argument.

    Returns
    -------
    pd.DataFrame
    """
    backend = self._model
    importance = backend.get_feature_importance(importance_type=importance_type)
    return importance

save

save(path)

保存模型。

源代码位于: Modeling_Tool/Model/GBM_Tool.py
def save(self, path):
    """Persist the model by delegating to the backend wrapper's ``save``.

    Parameters
    ----------
    path : object
        Destination, passed through unchanged.
    """
    backend = self._model
    backend.save(path)

load

load(path)

加载模型。

源代码位于: Modeling_Tool/Model/GBM_Tool.py
def load(self, path):
    """Load the model through the backend wrapper's ``load``.

    Parameters
    ----------
    path : object
        Source, passed through unchanged.

    Returns
    -------
    self
    """
    backend = self._model
    backend.load(path)
    return self

calibrate

calibrate(x, y, method='sigmoid', cv='prefit')

模型概率校准。

源代码位于: Modeling_Tool/Model/GBM_Tool.py
def calibrate(self, x, y, method='sigmoid', cv='prefit'):
    """Calibrate model probabilities via the backend wrapper.

    Parameters
    ----------
    x, y : array-like
        Calibration features and labels, passed through positionally.
    method : str, default 'sigmoid'
        Forwarded as a keyword argument.
    cv : object, default 'prefit'
        Forwarded as a keyword argument.

    Returns
    -------
    self
    """
    backend = self._model
    backend.calibrate(x, y, method=method, cv=cv)
    return self

brier_score

brier_score(x, y)

计算Brier分数。

源代码位于: Modeling_Tool/Model/GBM_Tool.py
def brier_score(self, x, y):
    """Return the Brier score computed by the backend wrapper for ``(x, y)``."""
    backend = self._model
    return backend.brier_score(x, y)

roc_auc

roc_auc(x, y)

计算ROC AUC。

源代码位于: Modeling_Tool/Model/GBM_Tool.py
def roc_auc(self, x, y):
    """Return the ROC AUC computed by the backend wrapper for ``(x, y)``."""
    backend = self._model
    return backend.roc_auc(x, y)

set_num_leaves

set_num_leaves(max_depth=5, wgt=1)

根据最大深度设置叶子节点数,避免过拟合。

根据给定的最大深度和权重系数,计算合适的叶子节点数量。 计算公式:2^max_depth - 2^max_depth * wgt

参数:

名称 类型 描述 默认
max_depth int

树的最大深度

5
wgt float

权重系数,取值范围[0, 1]

1

返回:

类型 描述
int

建议的叶子节点数

示例:

>>> set_num_leaves(max_depth=5)
0
>>> set_num_leaves(max_depth=5, wgt=0.5)
16
源代码位于: Modeling_Tool/Model/GBM_Tool.py
def set_num_leaves(max_depth=5, wgt=1):
    """Derive a leaf count from the maximum tree depth.

    Computes ``2 ** max_depth - 2 ** max_depth * wgt`` and truncates the
    result to an int, i.e. ``wgt`` is the fraction of the full
    ``2 ** max_depth`` leaves that is taken away.

    Parameters
    ----------
    max_depth : int, default 5
        Maximum tree depth.
    wgt : float, default 1
        Reduction factor, expected in [0, 1]. Note that the default of 1
        removes every leaf and yields 0.

    Returns
    -------
    int
        Suggested number of leaves.

    Examples
    --------
    >>> set_num_leaves(max_depth=5)
    0
    >>> set_num_leaves(max_depth=5, wgt=0.5)
    16
    """
    full_tree_leaves = 2 ** max_depth
    removed = full_tree_leaves * wgt
    return int(full_tree_leaves - removed)

lgb_model

lgb_model(x, y, valx, valy, params_dict, wgt=None, init_score=None, eval_sample_weight=None)

快速训练LightGBM模型。

使用训练集和验证集训练LightGBM模型,支持早停机制。

参数:

名称 类型 描述 默认
x array - like or DataFrame

训练集特征

必需
y array - like

训练集标签

必需
valx array - like or DataFrame

验证集特征

必需
valy array - like

验证集标签

必需
params_dict dict

LightGBM参数字典

必需
wgt array - like

样本权重

None
init_score array - like

初始化分数

None

返回:

类型 描述
LGBMClassifier

训练好的LightGBM模型

示例:

>>> params = {
...     'n_estimators': 100,
...     'max_depth': 5,
...     'learning_rate': 0.1,
...     'early_stopping_rounds': 20,
...     'eval_metric': 'auc'
... }
>>> model = lgb_model(x_train, y_train, x_val, y_val, params)
源代码位于: Modeling_Tool/Model/GBM_Tool.py
def lgb_model(x, y, valx, valy, params_dict, wgt=None, init_score=None, eval_sample_weight=None):
    """Train a LightGBM classifier with early stopping on a validation set.

    Parameters
    ----------
    x : array-like or pd.DataFrame
        Training features.
    y : array-like
        Training labels.
    valx : array-like or pd.DataFrame
        Validation features.
    valy : array-like
        Validation labels.
    params_dict : dict
        LightGBM parameters. ``'early_stopping_rounds'`` is required.
        ``'eval_metric'`` is not passed to the constructor; it is given to
        ``fit`` instead, falling back to ``'metric'`` and then ``'auc'``.
    wgt : array-like, optional
        Training sample weights.
    init_score : array-like, optional
        Initial score for the training data.
    eval_sample_weight : array-like, optional
        Validation sample weights; wrapped in a one-element list for ``fit``.

    Returns
    -------
    lgb.LGBMClassifier
        The fitted classifier.

    Raises
    ------
    KeyError
        If ``params_dict`` has no ``'early_stopping_rounds'`` entry.

    Examples
    --------
    >>> params = {
    ...     'n_estimators': 100,
    ...     'max_depth': 5,
    ...     'learning_rate': 0.1,
    ...     'early_stopping_rounds': 20,
    ...     'eval_metric': 'auc'
    ... }
    >>> model = lgb_model(x_train, y_train, x_val, y_val, params)
    """
    lgb = _get_lgb()

    constructor_params = {
        key: value for key, value in params_dict.items() if key != 'eval_metric'
    }
    model = lgb.LGBMClassifier(**constructor_params)

    if 'eval_metric' in params_dict:
        metric = params_dict['eval_metric']
    else:
        metric = params_dict.get('metric', 'auc')

    # lightgbm>=4 dropped `verbose` from fit(); logging is silenced through
    # callbacks instead.
    callbacks = [
        lgb.early_stopping(stopping_rounds=params_dict['early_stopping_rounds'], verbose=False),
        lgb.log_evaluation(period=0),
    ]

    if eval_sample_weight is not None:
        eval_weights = [eval_sample_weight]
    else:
        eval_weights = None

    model.fit(
        x, y,
        eval_set=[(valx, valy)],
        eval_metric=metric,
        callbacks=callbacks,
        sample_weight=wgt,
        eval_sample_weight=eval_weights,
        init_score=init_score
    )
    return model

lgb_varimp

lgb_varimp(model)

获取LightGBM模型特征重要性。

返回按特征重要性排序的DataFrame。

参数:

名称 类型 描述 默认
model LGBMClassifier

训练好的LightGBM模型

必需

返回:

类型 描述
DataFrame

包含 feature 和 importance 列的DataFrame,按重要性降序排列

示例:

>>> varimp = lgb_varimp(model)
>>> varimp.head(10)
源代码位于: Modeling_Tool/Model/GBM_Tool.py
def lgb_varimp(model):
    """Return a LightGBM model's gain-based feature importances.

    Parameters
    ----------
    model : lgb.LGBMClassifier
        Fitted model exposing ``booster_``.

    Returns
    -------
    pd.DataFrame
        Columns ``feature`` and ``importance``, sorted by importance in
        descending order with a fresh 0..n-1 index.

    Examples
    --------
    >>> varimp = lgb_varimp(model)
    >>> varimp.head(10)
    """
    names = model.booster_.feature_name()
    gains = model.booster_.feature_importance(importance_type='gain')
    table = pd.DataFrame({'feature': names, 'importance': gains})
    ranked = table.sort_values('importance', ascending=False)
    return ranked.reset_index(drop=True)

lgbm_quick_train

lgbm_quick_train(train_data, validation_data, x, y, params, wgt_col=None, val_wgt_col=None, cat_x_train=None)

快速训练LightGBM模型(使用DataFrame接口)。

接受DataFrame格式的训练集和验证集,自动提取特征和标签。

参数:

名称 类型 描述 默认
train_data DataFrame

训练数据集

必需
validation_data DataFrame

验证数据集

必需
x list of str

特征列名列表

必需
y str

目标变量列名

必需
params dict

LightGBM参数字典

必需
wgt_col str

样本权重列名

None
cat_x_train list of str

类别型特征列名列表

None

返回:

类型 描述
LGBMClassifier

训练好的LightGBM模型

示例:

>>> model = lgbm_quick_train(
...     train_data=train_df,
...     validation_data=val_df,
...     x=['feat1', 'feat2'],
...     y='target',
...     params=params_dict
... )
源代码位于: Modeling_Tool/Model/GBM_Tool.py
def lgbm_quick_train(train_data, validation_data, x, y, params, wgt_col = None, val_wgt_col = None, cat_x_train=None):
    """Train a LightGBM model from DataFrames via ``lgb_model``.

    Slices features, labels and optional weight columns out of the training
    and validation frames and hands them to ``lgb_model``.

    Parameters
    ----------
    train_data : pd.DataFrame
        Training data.
    validation_data : pd.DataFrame
        Validation data.
    x : list of str
        Feature column names.
    y : str
        Target column name.
    params : dict
        LightGBM parameter dictionary, passed on as ``params_dict``.
    wgt_col : str, optional
        Training sample-weight column in ``train_data``.
    val_wgt_col : str, optional
        Validation sample-weight column in ``validation_data``.
    cat_x_train : list of str, optional
        Categorical feature column names. NOTE: accepted but not used by the
        current implementation.

    Returns
    -------
    lgb.LGBMClassifier
        The fitted model returned by ``lgb_model``.

    Examples
    --------
    >>> model = lgbm_quick_train(
    ...     train_data=train_df,
    ...     validation_data=val_df,
    ...     x=['feat1', 'feat2'],
    ...     y='target',
    ...     params=params_dict
    ... )
    """
    # Result is not needed here; the call is kept as in the original
    # (presumably an early availability check for lightgbm).
    _get_lgb()

    train_weights = None
    if wgt_col is not None:
        train_weights = train_data[wgt_col]

    val_weights = None
    if val_wgt_col is not None:
        val_weights = validation_data[val_wgt_col]

    return lgb_model(
        x=train_data[x],
        y=train_data[y],
        valx=validation_data[x],
        valy=validation_data[y],
        params_dict=params,
        wgt=train_weights,
        eval_sample_weight=val_weights
    )

xgb_model

xgb_model(x, y, valx, valy, params_dict, sample_weight=None, sample_weight_eval_set=None, base_margin=None)

训练XGBoost模型。

使用训练集和验证集训练XGBoost模型,支持早停机制。

参数:

名称 类型 描述 默认
x array - like or DataFrame

训练集特征

必需
y array - like

训练集标签

必需
valx array - like or DataFrame

验证集特征

必需
valy array - like

验证集标签

必需
params_dict dict

XGBoost参数字典

必需
sample_weight array - like

训练集样本权重

None
sample_weight_eval_set list

验证集样本权重列表

None
base_margin array - like

基础边际度

None

返回:

类型 描述
XGBClassifier

训练好的XGBoost模型

示例:

>>> params = {
...     'n_estimators': 100,
...     'max_depth': 5,
...     'learning_rate': 0.1,
...     'early_stopping_rounds': 20,
...     'eval_metric': 'auc'
... }
>>> model = xgb_model(x_train, y_train, x_val, y_val, params)
源代码位于: Modeling_Tool/Model/GBM_Tool.py
def xgb_model(x, y, valx, valy, params_dict, sample_weight=None, sample_weight_eval_set=None, base_margin=None):
    """Train an XGBoost classifier with a validation set.

    All entries of ``params_dict`` -- including ``eval_metric`` and
    ``early_stopping_rounds`` -- are passed to the ``XGBClassifier``
    constructor. Previously ``eval_metric`` was removed from the parameters
    and never forwarded anywhere, so the configured metric was silently
    ignored and evaluation / early stopping fell back to XGBoost's default.

    Parameters
    ----------
    x : array-like or pd.DataFrame
        Training features.
    y : array-like
        Training labels.
    valx : array-like or pd.DataFrame
        Validation features.
    valy : array-like
        Validation labels.
    params_dict : dict
        XGBoost parameter dictionary (constructor keyword arguments).
    sample_weight : array-like, optional
        Training sample weights.
    sample_weight_eval_set : list, optional
        List of validation sample-weight arrays, one per eval set.
    base_margin : array-like, optional
        Base margin (initial raw score) for the training data.

    Returns
    -------
    xgb.XGBClassifier
        The fitted classifier.

    Examples
    --------
    >>> params = {
    ...     'n_estimators': 100,
    ...     'max_depth': 5,
    ...     'learning_rate': 0.1,
    ...     'early_stopping_rounds': 20,
    ...     'eval_metric': 'auc'
    ... }
    >>> model = xgb_model(x_train, y_train, x_val, y_val, params)
    """
    xgb = _get_xgb()

    # eval_metric belongs on the estimator: XGBoost >= 1.6 accepts it (and
    # early_stopping_rounds) as constructor arguments, and 2.x no longer
    # accepts it in fit(). Passing the full dict keeps the metric in effect.
    model = xgb.XGBClassifier(**params_dict)
    model.fit(
        x, y,
        eval_set=[(valx, valy)],
        verbose=False,
        sample_weight=sample_weight,
        sample_weight_eval_set=sample_weight_eval_set,
        base_margin=base_margin
    )
    return model

xgb_varimp

xgb_varimp(model)

获取XGBoost模型特征重要性。

返回按特征重要性排序的DataFrame。

参数:

名称 类型 描述 默认
model XGBClassifier

训练好的XGBoost模型

必需

返回:

类型 描述
DataFrame

包含 feature 和 importance 列的DataFrame,按重要性降序排列

示例:

>>> varimp = xgb_varimp(model)
>>> varimp.head(10)
源代码位于: Modeling_Tool/Model/GBM_Tool.py
def xgb_varimp(model):
    """Return an XGBoost model's feature importances as a sorted table.

    The scores come from the booster's ``get_fscore()``.

    Parameters
    ----------
    model : xgb.XGBClassifier
        Fitted model exposing ``get_booster()``.

    Returns
    -------
    pd.DataFrame
        Columns ``feature`` and ``importance``, sorted by importance in
        descending order with a fresh 0..n-1 index.

    Examples
    --------
    >>> varimp = xgb_varimp(model)
    >>> varimp.head(10)
    """
    booster = model.get_booster()
    scores = booster.get_fscore()
    rows = list(scores.items())
    table = pd.DataFrame(rows, columns=['feature', 'importance'])
    ranked = table.sort_values('importance', ascending=False)
    return ranked.reset_index(drop=True)

xgbm_quick_train

xgbm_quick_train(train_data, validation_data, x, y, wgt_col=None, params=None, sample_weight_eval_set=None, val_wgt_col=None)

快速训练XGBoost模型(使用DataFrame接口)。

接受DataFrame格式的训练集和验证集,自动提取特征和标签。

参数:

名称 类型 描述 默认
train_data DataFrame

训练数据集

必需
validation_data DataFrame

验证数据集

必需
x list of str

特征列名列表

必需
y str

目标变量列名

必需
wgt_col str

样本权重列名

None
params dict

XGBoost参数字典

None
sample_weight_eval_set list

验证集样本权重列表

None

返回:

类型 描述
XGBClassifier

训练好的XGBoost模型

示例:

>>> model = xgbm_quick_train(
...     train_data=train_df,
...     validation_data=val_df,
...     x=['feat1', 'feat2'],
...     y='target',
...     wgt_col='weight',
...     params=params_dict
... )
源代码位于: Modeling_Tool/Model/GBM_Tool.py
def xgbm_quick_train(train_data, validation_data, x, y, wgt_col=None, params=None,
                     sample_weight_eval_set=None, val_wgt_col=None):
    """Train an XGBoost model from DataFrame inputs.

    Slices the feature and label columns out of the training and validation
    frames and delegates the actual fitting to ``xgb_model``.

    Parameters
    ----------
    train_data : pd.DataFrame
        Training dataset.
    validation_data : pd.DataFrame
        Validation dataset.
    x : list of str
        Feature column names.
    y : str
        Target column name.
    wgt_col : str, optional
        Column of ``train_data`` holding the training sample weights.
    params : dict, optional
        XGBoost parameter dictionary, forwarded unchanged to ``xgb_model``
        as ``params_dict``.
    sample_weight_eval_set : list, optional
        Sample weights for the validation set. When given, it takes
        precedence over ``val_wgt_col``.
    val_wgt_col : str, optional
        Column of ``validation_data`` used to build
        ``sample_weight_eval_set`` when that argument is ``None``.

    Returns
    -------
    xgb.XGBClassifier
        The fitted model returned by ``xgb_model``.

    Examples
    --------
    >>> model = xgbm_quick_train(
    ...     train_data=train_df,
    ...     validation_data=val_df,
    ...     x=['feat1', 'feat2'],
    ...     y='target',
    ...     wgt_col='weight',
    ...     params=params_dict
    ... )
    """
    # Resolve the xgboost dependency up front; the returned handle itself is
    # not needed in this wrapper.
    _get_xgb()

    train_weight = None
    if wgt_col is not None:
        train_weight = train_data[wgt_col]

    eval_weights = sample_weight_eval_set
    if eval_weights is None and val_wgt_col is not None:
        eval_weights = [validation_data[val_wgt_col]]

    return xgb_model(
        x=train_data[x],
        y=train_data[y],
        valx=validation_data[x],
        valy=validation_data[y],
        params_dict=params,
        sample_weight=train_weight,
        sample_weight_eval_set=eval_weights,
    )

catboost_model

catboost_model(x, y, valx, valy, params_dict, sample_weight=None)

训练CatBoost模型。

使用训练集和验证集训练CatBoost模型,支持早停机制。

参数:

名称 类型 描述 默认
x array-like or DataFrame

训练集特征

必需
y array-like

训练集标签

必需
valx array-like or DataFrame

验证集特征

必需
valy array-like

验证集标签

必需
params_dict dict

CatBoost参数字典(支持 n_estimators / max_depth / random_state 别名)

必需
sample_weight array-like

训练集样本权重

None

返回:

类型 描述
CatBoostClassifier

训练好的CatBoost模型

示例:

>>> params = {
...     'n_estimators': 100,
...     'max_depth': 5,
...     'learning_rate': 0.1,
...     'early_stopping_rounds': 20,
...     'eval_metric': 'AUC'
... }
>>> model = catboost_model(x_train, y_train, x_val, y_val, params)
源代码位于: Modeling_Tool/Model/GBM_Tool.py
def catboost_model(x, y, valx, valy, params_dict, sample_weight=None):
    """Train a CatBoost model.

    Fits a CatBoost classifier on the training set and evaluates on the
    validation set, with optional early stopping.

    Parameters
    ----------
    x : array-like or pd.DataFrame
        Training features.
    y : array-like
        Training labels.
    valx : array-like or pd.DataFrame
        Validation features.
    valy : array-like
        Validation labels.
    params_dict : dict
        CatBoost parameter dictionary (the n_estimators / max_depth /
        random_state aliases are supported).
    sample_weight : array-like, optional
        Training sample weights.

    Returns
    -------
    CatBoostClassifier
        The fitted CatBoost model.

    Examples
    --------
    >>> params = {
    ...     'n_estimators': 100,
    ...     'max_depth': 5,
    ...     'learning_rate': 0.1,
    ...     'early_stopping_rounds': 20,
    ...     'eval_metric': 'AUC'
    ... }
    >>> model = catboost_model(x_train, y_train, x_val, y_val, params)
    """
    classifier_cls = _get_catboost()
    # eval_metric is folded into cb_params by _normalize_catboost_params because
    # CatBoost only accepts it as a constructor argument, not in fit(); the
    # separately returned value is therefore not used here.
    cb_params, stop_rounds, _eval_metric, categorical = (
        _normalize_catboost_params(params_dict)
    )
    model = classifier_cls(**cb_params)

    # Arguments that are only forwarded to fit() when they were supplied.
    optional_args = {
        'early_stopping_rounds': stop_rounds,
        'cat_features': categorical,
        'sample_weight': sample_weight,
    }
    fit_kwargs = {
        'eval_set': (valx, valy),
        'verbose': cb_params.get('verbose', False),
    }
    fit_kwargs.update(
        {name: value for name, value in optional_args.items() if value is not None}
    )

    model.fit(x, y, **fit_kwargs)
    return model

catboost_varimp

catboost_varimp(model)

获取CatBoost模型特征重要性。

返回按特征重要性排序的DataFrame。

参数:

名称 类型 描述 默认
model CatBoostClassifier

训练好的CatBoost模型

必需

返回:

类型 描述
DataFrame

包含 feature 和 importance 列的DataFrame,按重要性降序排列

示例:

>>> varimp = catboost_varimp(model)
>>> varimp.head(10)
源代码位于: Modeling_Tool/Model/GBM_Tool.py
def catboost_varimp(model):
    """Return the feature importance of a CatBoost model as a DataFrame.

    Feature names come from ``_extract_estimator_feature_names``; when that
    helper returns ``None`` the positions ``0..model.feature_count_ - 1`` are
    used as names instead.

    Parameters
    ----------
    model : CatBoostClassifier
        A fitted CatBoost model.

    Returns
    -------
    pd.DataFrame
        Columns ``feature`` and ``importance``, sorted by ``importance`` in
        descending order with a fresh 0..n-1 index.

    Examples
    --------
    >>> varimp = catboost_varimp(model)
    >>> varimp.head(10)
    """
    names = _extract_estimator_feature_names(model)
    if names is None:
        # No names available: fall back to positional identifiers.
        names = list(range(model.feature_count_))

    scores = model.get_feature_importance()
    frame = pd.DataFrame({'feature': names, 'importance': scores})
    return frame.sort_values('importance', ascending=False).reset_index(drop=True)

catboost_quick_train

catboost_quick_train(train_data, validation_data, x, y, params, wgt_col=None, val_wgt_col=None, cat_features=None)

快速训练CatBoost模型(使用DataFrame接口)。

接受DataFrame格式的训练集和验证集,自动提取特征和标签。

参数:

名称 类型 描述 默认
train_data DataFrame

训练数据集

必需
validation_data DataFrame

验证数据集

必需
x list of str

特征列名列表

必需
y str

目标变量列名

必需
params dict

CatBoost参数字典

必需
wgt_col str

样本权重列名

None
cat_features list

类别型特征列名或索引列表

None

返回:

类型 描述
CatBoostClassifier

训练好的CatBoost模型

示例:

>>> model = catboost_quick_train(
...     train_data=train_df,
...     validation_data=val_df,
...     x=['feat1', 'feat2'],
...     y='target',
...     params=params_dict
... )
源代码位于: Modeling_Tool/Model/GBM_Tool.py
def catboost_quick_train(train_data, validation_data, x, y, params, wgt_col=None,
                         val_wgt_col=None, cat_features=None):
    """Train a CatBoost model from DataFrame inputs.

    Slices the feature and label columns out of the training and validation
    frames and delegates the actual fitting to ``catboost_model``.

    Parameters
    ----------
    train_data : pd.DataFrame
        Training dataset.
    validation_data : pd.DataFrame
        Validation dataset.
    x : list of str
        Feature column names.
    y : str
        Target column name.
    params : dict
        CatBoost parameter dictionary. It is copied before use, so the
        caller's object is not modified by this function.
    wgt_col : str, optional
        Column of ``train_data`` holding the training sample weights.
    val_wgt_col : str, optional
        Accepted for interface compatibility; this function does not
        reference it.
    cat_features : list, optional
        Categorical feature names or indices. When given, it is stored under
        the ``'cat_features'`` key of the copied parameter dictionary.

    Returns
    -------
    CatBoostClassifier
        The fitted model returned by ``catboost_model``.

    Examples
    --------
    >>> model = catboost_quick_train(
    ...     train_data=train_df,
    ...     validation_data=val_df,
    ...     x=['feat1', 'feat2'],
    ...     y='target',
    ...     params=params_dict
    ... )
    """
    # Resolve the catboost dependency up front; the return value is not needed.
    _get_catboost()

    merged_params = dict(params)
    if cat_features is not None:
        merged_params['cat_features'] = cat_features

    train_weight = None if wgt_col is None else train_data[wgt_col]

    return catboost_model(
        x=train_data[x],
        y=train_data[y],
        valx=validation_data[x],
        valy=validation_data[y],
        params_dict=merged_params,
        sample_weight=train_weight,
    )

后向变量消元 — Backward_Tool

Backward_Tool

向后变量消除工具包(统一版)

本模块提供基于LightGBM和XGBoost的向后变量消除(Backward Variable Elimination)功能, 通过累计特征重要性阈值进行变量筛选,并支持训练后的性能分析。

函数:

名称 描述
backward_lgbm

使用LightGBM模型进行向后变量消除

backward_xgbm

使用XGBoost模型进行向后变量消除

类:

名称 描述
BackwardVariableEliminator

向后变量消除器,支持LightGBM和XGBoost

BackwardEliminationAnalyzer

向后消除结果分析器

BackwardVariableEliminator

向后变量消除器。

封装LightGBM/XGBoost向后变量消除流程, 支持多轮消除和结果汇总。

参数:

名称 类型 描述 默认
train_data DataFrame

训练数据集

必需
varlist list of str

初始特征变量列表

必需
dep str

目标变量列名

必需
model_type str

模型类型,可选 "lgbm" 或 "xgbm"

"lgbm"
validation_data DataFrame

验证数据集

None
test_data_dict dict

测试数据集字典

None

示例:

>>> eliminator = BackwardVariableEliminator(
...     train_data=train_df,
...     varlist=feature_cols,
...     dep='target',
...     model_type='lgbm',
...     validation_data=val_df
... )
>>> results = eliminator.run(n_rounds=5)
源代码位于: Modeling_Tool/Model/Backward_Tool.py
class BackwardVariableEliminator:
    """
    Multi-round backward variable eliminator.

    Wraps the LightGBM / XGBoost backward-elimination functions, feeding the
    variables kept by one round into the next and recording each round.

    Parameters
    ----------
    train_data : pd.DataFrame
        Training dataset.
    varlist : list of str
        Initial list of feature variables.
    dep : str
        Target column name.
    model_type : str, default "lgbm"
        Model type; lower-cased on construction. ``"lgbm"`` selects
        ``backward_lgbm``; any other value selects ``backward_xgbm``.
    validation_data : pd.DataFrame, optional
        Validation dataset.
    test_data_dict : dict, optional
        Mapping of test-set name to DataFrame; ``None`` is stored as ``{}``.
    weight_col : str, optional
        Sample-weight column name forwarded to the backward function.
    validation_weight_col : str, optional
        Validation sample-weight column name forwarded to the backward
        function.
    wgt_col : str, optional
        Alias consulted only when ``weight_col`` is falsy.

    Examples
    --------
    >>> eliminator = BackwardVariableEliminator(
    ...     train_data=train_df,
    ...     varlist=feature_cols,
    ...     dep='target',
    ...     model_type='lgbm',
    ...     validation_data=val_df
    ... )
    >>> results = eliminator.run(n_rounds=5)
    """

    def __init__(
        self,
        train_data: pd.DataFrame,
        varlist: List[str],
        dep: str,
        model_type: str = "lgbm",
        validation_data: Optional[pd.DataFrame] = None,
        test_data_dict: Optional[Dict[str, pd.DataFrame]] = None,
        weight_col: Optional[str] = None,
        validation_weight_col: Optional[str] = None,
        wgt_col: Optional[str] = None,
    ):
        # Data and modelling target.
        self.train_data = train_data
        self.validation_data = validation_data
        self.test_data_dict = test_data_dict or {}
        self.varlist = varlist
        self.dep = dep
        self.model_type = model_type.lower()
        # ``wgt_col`` is only a fallback for ``weight_col``.
        self.weight_col = weight_col or wgt_col
        self.validation_weight_col = validation_weight_col
        # Per-round records, filled by run().
        self._results = []

    def run(
        self,
        n_rounds: int = 5,
        varreduct_params: Optional[Dict] = None,
        stopping_metric: str = "auc",
        seed: int = 42,
        num_boost_round: int = 200,
        early_stopping_rounds: int = 20,
        importance_type: str = "gain",
        cum_importance_threshold: float = 0.99,
        min_vars: int = 10,
        ret_perf: bool = True,
        nbins: int = 10,
        **kwargs,
    ) -> List[Dict]:
        """
        Run several rounds of backward variable elimination.

        Each round calls the selected backward function on the variables kept
        by the previous round. The loop ends after ``n_rounds`` rounds or as
        soon as the number of kept variables is ``<= min_vars``.

        Parameters
        ----------
        n_rounds : int, default 5
            Maximum number of elimination rounds.
        varreduct_params : dict, optional
            Model hyper-parameters; a deep copy is passed to every round.
        stopping_metric : str, default "auc"
            Early-stopping metric.
        seed : int, default 42
            Random seed.
        num_boost_round : int, default 200
            Maximum number of boosting iterations.
        early_stopping_rounds : int, default 20
            Early-stopping patience.
        importance_type : str, default "gain"
            Feature-importance type.
        cum_importance_threshold : float, default 0.99
            Cumulative importance threshold.
        min_vars : int, default 10
            Minimum number of variables to keep.
        ret_perf : bool, default True
            Whether the backward function should return performance metrics.
        nbins : int, default 10
            Number of bins.
        **kwargs
            Extra keyword arguments forwarded to the backward function.

        Returns
        -------
        list of dict
            One record per executed round with keys ``round``, ``n_vars_in``,
            ``n_vars_out``, ``selected_vars``, ``model`` and ``perf``.
        """
        current_vars = self.varlist.copy()
        self._results = []

        if self.model_type == "lgbm":
            backward_fn = backward_lgbm
        else:
            backward_fn = backward_xgbm

        # Arguments that do not change from one round to the next.
        fixed_args = dict(
            train_data=self.train_data,
            dep=self.dep,
            stopping_metric=stopping_metric,
            seed=seed,
            num_boost_round=num_boost_round,
            early_stopping_rounds=early_stopping_rounds,
            importance_type=importance_type,
            cum_importance_threshold=cum_importance_threshold,
            min_vars=min_vars,
            validation_data=self.validation_data,
            test_data_dict=self.test_data_dict,
            ret_perf=ret_perf,
            nbins=nbins,
            weight_col=self.weight_col,
            validation_weight_col=self.validation_weight_col,
        )

        for round_idx in range(1, n_rounds + 1):
            logging.info(f"[BackwardVariableEliminator] Round {round_idx}/{n_rounds}, vars={len(current_vars)}")

            outcome = backward_fn(
                varlist=current_vars,
                # Fresh copy per round so one round cannot affect the next.
                varreduct_params=copy.deepcopy(varreduct_params),
                **fixed_args,
                **kwargs,
            )

            perf_dict = {}
            if ret_perf:
                selected_vars, model, perf_dict = outcome
            else:
                selected_vars, model = outcome

            self._results.append({
                "round": round_idx,
                "n_vars_in": len(current_vars),
                "n_vars_out": len(selected_vars),
                "selected_vars": selected_vars,
                "model": model,
                "perf": perf_dict,
            })
            current_vars = selected_vars

            if len(current_vars) <= min_vars:
                logging.info(f"[BackwardVariableEliminator] Reached min_vars={min_vars}, stopping early.")
                break

        return self._results

    def get_final_vars(self) -> List[str]:
        """Return the variables kept by the last round, or ``varlist`` if no round has run."""
        if self._results:
            return self._results[-1]["selected_vars"]
        return self.varlist

    def get_summary(self) -> pd.DataFrame:
        """Return one row per round with the variable counts in, out and removed."""
        return pd.DataFrame([
            {
                "round": record["round"],
                "n_vars_in": record["n_vars_in"],
                "n_vars_out": record["n_vars_out"],
                "vars_removed": record["n_vars_in"] - record["n_vars_out"],
            }
            for record in self._results
        ])

run

run(n_rounds: int = 5, varreduct_params: Optional[Dict] = None, stopping_metric: str = 'auc', seed: int = 42, num_boost_round: int = 200, early_stopping_rounds: int = 20, importance_type: str = 'gain', cum_importance_threshold: float = 0.99, min_vars: int = 10, ret_perf: bool = True, nbins: int = 10, **kwargs) -> List[Dict]

运行多轮向后变量消除。

参数:

名称 类型 描述 默认
n_rounds int

消除轮数

5
varreduct_params dict

模型超参数

None
stopping_metric str

早停指标

"auc"
seed int

随机种子

42
num_boost_round int

最大迭代轮数

200
early_stopping_rounds int

早停轮数

20
importance_type str

特征重要性类型

"gain"
cum_importance_threshold float

累计重要性阈值

0.99
min_vars int

最小保留变量数

10
ret_perf bool

是否返回性能指标

True
nbins int

分箱数

10

返回:

类型 描述
list of dict

每轮消除结果列表

源代码位于: Modeling_Tool/Model/Backward_Tool.py
def run(
    self,
    n_rounds: int = 5,
    varreduct_params: Optional[Dict] = None,
    stopping_metric: str = "auc",
    seed: int = 42,
    num_boost_round: int = 200,
    early_stopping_rounds: int = 20,
    importance_type: str = "gain",
    cum_importance_threshold: float = 0.99,
    min_vars: int = 10,
    ret_perf: bool = True,
    nbins: int = 10,
    **kwargs,
) -> List[Dict]:
    """
    Run several rounds of backward variable elimination.

    Each round calls the selected backward function on the variables kept by
    the previous round. The loop ends after ``n_rounds`` rounds or as soon as
    the number of kept variables is ``<= min_vars``.

    Parameters
    ----------
    n_rounds : int, default 5
        Maximum number of elimination rounds.
    varreduct_params : dict, optional
        Model hyper-parameters; a deep copy is passed to every round.
    stopping_metric : str, default "auc"
        Early-stopping metric.
    seed : int, default 42
        Random seed.
    num_boost_round : int, default 200
        Maximum number of boosting iterations.
    early_stopping_rounds : int, default 20
        Early-stopping patience.
    importance_type : str, default "gain"
        Feature-importance type.
    cum_importance_threshold : float, default 0.99
        Cumulative importance threshold.
    min_vars : int, default 10
        Minimum number of variables to keep.
    ret_perf : bool, default True
        Whether the backward function should return performance metrics.
    nbins : int, default 10
        Number of bins.
    **kwargs
        Extra keyword arguments forwarded to the backward function.

    Returns
    -------
    list of dict
        One record per executed round with keys ``round``, ``n_vars_in``,
        ``n_vars_out``, ``selected_vars``, ``model`` and ``perf``.
    """
    current_vars = self.varlist.copy()
    self._results = []

    # "lgbm" picks the LightGBM routine; every other value picks XGBoost.
    if self.model_type == "lgbm":
        backward_fn = backward_lgbm
    else:
        backward_fn = backward_xgbm

    # Arguments that do not change from one round to the next.
    fixed_args = dict(
        train_data=self.train_data,
        dep=self.dep,
        stopping_metric=stopping_metric,
        seed=seed,
        num_boost_round=num_boost_round,
        early_stopping_rounds=early_stopping_rounds,
        importance_type=importance_type,
        cum_importance_threshold=cum_importance_threshold,
        min_vars=min_vars,
        validation_data=self.validation_data,
        test_data_dict=self.test_data_dict,
        ret_perf=ret_perf,
        nbins=nbins,
        weight_col=self.weight_col,
        validation_weight_col=self.validation_weight_col,
    )

    for round_idx in range(1, n_rounds + 1):
        logging.info(f"[BackwardVariableEliminator] Round {round_idx}/{n_rounds}, vars={len(current_vars)}")

        outcome = backward_fn(
            varlist=current_vars,
            # Fresh copy per round so one round cannot affect the next.
            varreduct_params=copy.deepcopy(varreduct_params),
            **fixed_args,
            **kwargs,
        )

        perf_dict = {}
        if ret_perf:
            selected_vars, model, perf_dict = outcome
        else:
            selected_vars, model = outcome

        self._results.append({
            "round": round_idx,
            "n_vars_in": len(current_vars),
            "n_vars_out": len(selected_vars),
            "selected_vars": selected_vars,
            "model": model,
            "perf": perf_dict,
        })
        current_vars = selected_vars

        if len(current_vars) <= min_vars:
            logging.info(f"[BackwardVariableEliminator] Reached min_vars={min_vars}, stopping early.")
            break

    return self._results

get_final_vars

get_final_vars() -> List[str]

获取最终筛选后的变量列表。

源代码位于: Modeling_Tool/Model/Backward_Tool.py
def get_final_vars(self) -> List[str]:
    """Return the variables kept by the last round, or ``varlist`` if no round has run."""
    if self._results:
        return self._results[-1]["selected_vars"]
    return self.varlist

get_summary

get_summary() -> DataFrame

获取每轮消除汇总表。

源代码位于: Modeling_Tool/Model/Backward_Tool.py
def get_summary(self) -> pd.DataFrame:
    """Return one row per round with the variable counts in, out and removed."""
    return pd.DataFrame([
        {
            "round": record["round"],
            "n_vars_in": record["n_vars_in"],
            "n_vars_out": record["n_vars_out"],
            "vars_removed": record["n_vars_in"] - record["n_vars_out"],
        }
        for record in self._results
    ])

BackwardEliminationAnalyzer

向后消除结果分析器。

对 BackwardVariableEliminator 的运行结果进行分析和可视化。

参数:

名称 类型 描述 默认
results list of dict

BackwardVariableEliminator.run() 的返回值

必需

示例:

>>> analyzer = BackwardEliminationAnalyzer(results)
>>> analyzer.plot_var_reduction()
>>> final_vars = analyzer.get_stable_vars(top_n=20)
源代码位于: Modeling_Tool/Model/Backward_Tool.py
class BackwardEliminationAnalyzer:
    """
    Analyzer for backward-elimination results.

    Analyzes and visualizes the per-round records produced by
    ``BackwardVariableEliminator``.

    Parameters
    ----------
    results : list of dict
        The return value of ``BackwardVariableEliminator.run()``.

    Examples
    --------
    >>> analyzer = BackwardEliminationAnalyzer(results)
    >>> analyzer.plot_var_reduction()
    >>> final_vars = analyzer.get_stable_vars(top_n=20)
    """

    def __init__(self, results: List[Dict]):
        self.results = results

    def get_stable_vars(self, top_n: Optional[int] = None) -> List[str]:
        """
        Return the variables that were kept in every round.

        Parameters
        ----------
        top_n : int, optional
            Return only the first N names of the sorted result; ``None``
            returns all of them.

        Returns
        -------
        list of str
            Stable variable names in sorted order; empty when there are no
            results.
        """
        if not self.results:
            return []

        per_round = (set(record["selected_vars"]) for record in self.results)
        common = set.intersection(*per_round)

        ordered = sorted(common)
        return ordered if top_n is None else ordered[:top_n]

    def plot_var_reduction(
        self,
        figsize: Tuple[int, int] = (8, 4),
        save_path: Optional[str] = None,
    ) -> None:
        """
        Plot the number of kept variables against the elimination round.

        Parameters
        ----------
        figsize : tuple, default (8, 4)
            Figure size.
        save_path : str, optional
            Where to save the image; a falsy value shows the figure instead.
        """
        round_ids = [record["round"] for record in self.results]
        kept_counts = [record["n_vars_out"] for record in self.results]

        fig, ax = plt.subplots(figsize=figsize)
        ax.plot(round_ids, kept_counts, marker="o", linewidth=2, color="#4C72B0")
        ax.set_xlabel("消除轮次")
        ax.set_ylabel("保留变量数")
        ax.set_title("向后变量消除:变量数量变化")
        # Rounds are integers, so keep the x ticks on integer positions.
        ax.xaxis.set_major_locator(mticker.MaxNLocator(integer=True))
        plt.tight_layout()

        if not save_path:
            plt.show()
            return

        fig.savefig(save_path, dpi=150, bbox_inches="tight")
        plt.close(fig)

    def get_perf_trend(self, dataset: str = "mdl", metric: str = "IV") -> pd.DataFrame:
        """
        Return how a performance metric on one dataset changes across rounds.

        Rounds whose ``perf`` has no entry for ``dataset``, or a ``None``
        entry, are skipped. When the entry is not a dict the metric value is
        recorded as ``None``.

        Parameters
        ----------
        dataset : str, default "mdl"
            Dataset name, e.g. "mdl", "hd", "oot".
        metric : str, default "IV"
            Name of the performance metric to read.

        Returns
        -------
        pd.DataFrame
            One row per retained round with columns ``round`` and ``metric``.
        """
        trend = []
        for record in self.results:
            perf = record.get("perf", {})
            if dataset not in perf:
                continue
            entry = perf[dataset]
            if entry is None:
                continue
            value = entry.get(metric, None) if isinstance(entry, dict) else None
            trend.append({"round": record["round"], metric: value})
        return pd.DataFrame(trend)

get_stable_vars

get_stable_vars(top_n: Optional[int] = None) -> List[str]

获取在所有轮次中均被保留的稳定变量。

参数:

名称 类型 描述 默认
top_n int

返回前N个稳定变量,None表示返回全部

None

返回:

类型 描述
list of str
源代码位于: Modeling_Tool/Model/Backward_Tool.py
def get_stable_vars(self, top_n: Optional[int] = None) -> List[str]:
    """
    Return the variables that were kept in every round.

    Parameters
    ----------
    top_n : int, optional
        Return only the first N names of the sorted result; ``None`` returns
        all of them.

    Returns
    -------
    list of str
        Stable variable names in sorted order; empty when there are no
        results.
    """
    if not self.results:
        return []

    per_round = (set(record["selected_vars"]) for record in self.results)
    common = set.intersection(*per_round)

    ordered = sorted(common)
    return ordered if top_n is None else ordered[:top_n]

plot_var_reduction

plot_var_reduction(figsize: Tuple[int, int] = (8, 4), save_path: Optional[str] = None) -> None

绘制变量数量随消除轮次变化的折线图。

参数:

名称 类型 描述 默认
figsize tuple

图形尺寸

(8, 4)
save_path str

图片保存路径,None表示直接显示

None
源代码位于: Modeling_Tool/Model/Backward_Tool.py
def plot_var_reduction(
    self,
    figsize: Tuple[int, int] = (8, 4),
    save_path: Optional[str] = None,
) -> None:
    """
    Plot the number of kept variables against the elimination round.

    Parameters
    ----------
    figsize : tuple, default (8, 4)
        Figure size.
    save_path : str, optional
        Where to save the image; a falsy value shows the figure instead.
    """
    round_ids = [record["round"] for record in self.results]
    kept_counts = [record["n_vars_out"] for record in self.results]

    fig, ax = plt.subplots(figsize=figsize)
    ax.plot(round_ids, kept_counts, marker="o", linewidth=2, color="#4C72B0")
    ax.set_xlabel("消除轮次")
    ax.set_ylabel("保留变量数")
    ax.set_title("向后变量消除:变量数量变化")
    # Rounds are integers, so keep the x ticks on integer positions.
    ax.xaxis.set_major_locator(mticker.MaxNLocator(integer=True))
    plt.tight_layout()

    if not save_path:
        plt.show()
        return

    fig.savefig(save_path, dpi=150, bbox_inches="tight")
    plt.close(fig)

get_perf_trend

get_perf_trend(dataset: str = 'mdl', metric: str = 'IV') -> DataFrame

获取指定数据集上性能指标随轮次变化趋势。

参数:

名称 类型 描述 默认
dataset str

数据集名称,如 "mdl", "hd", "oot"

"mdl"
metric str

性能指标列名

"IV"

返回:

类型 描述
DataFrame
源代码位于: Modeling_Tool/Model/Backward_Tool.py
def get_perf_trend(self, dataset: str = "mdl", metric: str = "IV") -> pd.DataFrame:
    """
    Return how a performance metric on one dataset changes across rounds.

    Rounds whose ``perf`` has no entry for ``dataset``, or a ``None`` entry,
    are skipped. When the entry is not a dict the metric value is recorded as
    ``None``.

    Parameters
    ----------
    dataset : str, default "mdl"
        Dataset name, e.g. "mdl", "hd", "oot".
    metric : str, default "IV"
        Name of the performance metric to read.

    Returns
    -------
    pd.DataFrame
        One row per retained round with columns ``round`` and ``metric``.
    """
    trend = []
    for record in self.results:
        perf = record.get("perf", {})
        if dataset not in perf:
            continue
        entry = perf[dataset]
        if entry is None:
            continue
        value = entry.get(metric, None) if isinstance(entry, dict) else None
        trend.append({"round": record["round"], metric: value})
    return pd.DataFrame(trend)

backward_lgbm

backward_lgbm(train_data, varlist: List[str], dep: str, varreduct_params=None, stopping_metric='auc', seed=42, num_boost_round: int = 200, early_stopping_rounds: int = 20, importance_type: str = 'gain', cum_importance_threshold: float = 0.99, min_vars: int = 10, validation_data=None, test_data_dict=None, ret_perf: bool = True, nbins: int = 10, precision: int = 5, min_bin_prop: float = 0.05, include_missing: bool = True, equal_freq: bool = True, ascending: bool = True, fillna: Optional[float] = None, spec_values: Optional[List] = None, weight_col: Optional[str] = None, validation_weight_col: Optional[str] = None, wgt_col=None)

使用LightGBM模型进行向后变量消除。

通过训练LightGBM模型并根据特征重要性累计阈值筛选变量, 实现向后变量消除(Backward Variable Elimination)。

参数:

名称 类型 描述 默认
train_data DataFrame

训练数据集,必须包含dep列和varlist中的所有特征列

必需
varlist list of str

参与建模的特征变量列表

必需
dep str

目标变量列名(0/1二元变量)

必需
varreduct_params dict

LightGBM超参数字典,未指定的必需参数将使用预设值

None
stopping_metric str

早停评估指标,可选 "auc", "binary_logloss" 等

"auc"
seed int

随机种子,保证可复现性

42
num_boost_round int

最大迭代轮数

200
early_stopping_rounds int

早停轮数,验证集指标连续N轮无提升则停止

20
importance_type str

特征重要性类型,可选 "gain", "split"

"gain"
cum_importance_threshold float

累计特征重要性阈值,筛选覆盖该比例重要性的最少特征

0.99
min_vars int

保留的最小变量数量

10
validation_data DataFrame

验证数据集,用于早停

None
test_data_dict dict

测试数据集字典,格式为 {名称: DataFrame}

None
ret_perf bool

是否返回模型性能指标

True
nbins int

增益表分箱数

10
precision int

数值精度

5
min_bin_prop float

最小分箱比例

0.05
include_missing bool

是否包含缺失值分箱

True
equal_freq bool

是否使用等频分箱

True
ascending bool

增益表是否升序排列

True
fillna float

缺失值填充值

None
spec_values list

特殊值列表

None

返回:

类型 描述
tuple

(selected_vars, model, perf_dict) 或 (selected_vars, model)

引发:

类型 描述
TypeError

当输入数据不是pandas.DataFrame格式时

ImportError

当lightgbm未安装时

示例:

>>> selected_vars, model, perf = backward_lgbm(
...     train_data=train_df,
...     varlist=feature_cols,
...     dep='target',
...     validation_data=val_df
... )
源代码位于: Modeling_Tool/Model/Backward_Tool.py
def backward_lgbm(
    train_data: pd.DataFrame,
    varlist: List[str],
    dep: str,
    varreduct_params: Optional[Dict] = None,
    stopping_metric: str = "auc",
    seed: int = 42,
    num_boost_round: int = 200,
    early_stopping_rounds: int = 20,
    importance_type: str = "gain",
    cum_importance_threshold: float = 0.99,
    min_vars: int = 10,
    validation_data: Optional[pd.DataFrame] = None,
    test_data_dict: Optional[Dict[str, pd.DataFrame]] = None,
    ret_perf: bool = True,
    nbins: int = 10,
    precision: int = 5,
    min_bin_prop: float = 0.05,
    include_missing: bool = True,
    equal_freq: bool = True,
    ascending: bool = True,
    fillna: Optional[float] = None,
    spec_values: Optional[List] = None,
    weight_col: Optional[str] = None,
    validation_weight_col: Optional[str] = None,
    wgt_col: Optional[str] = None,
) -> Tuple:
    """
    Run one backward variable-elimination pass with a LightGBM model.

    Trains a LightGBM model on ``varlist``, ranks the features by importance
    and keeps the leading features according to a cumulative-importance
    threshold.

    Parameters
    ----------
    train_data : pd.DataFrame
        Training dataset; must contain ``dep`` and every column in ``varlist``.
    varlist : list of str
        Feature variables used for modelling.
    dep : str
        Target column name (0/1 binary variable).
    varreduct_params : dict, optional
        LightGBM hyper-parameters. Missing required keys (``metric``,
        ``seed``, ``objective``, ``boosting_type``, ``num_threads``) are
        filled from presets. The caller's dict is copied, never modified.
    stopping_metric : str, default "auc"
        Early-stopping metric, e.g. "auc", "binary_logloss".
    seed : int, default 42
        Random seed, for reproducibility.
    num_boost_round : int, default 200
        Maximum number of boosting iterations.
    early_stopping_rounds : int, default 20
        Stop when the validation metric has not improved for this many rounds.
    importance_type : str, default "gain"
        Feature-importance type, e.g. "gain", "split".
    cum_importance_threshold : float, default 0.99
        Cumulative feature-importance threshold used for selection.
    min_vars : int, default 10
        Minimum number of variables to keep.
    validation_data : pd.DataFrame, optional
        Validation dataset used for early stopping. When omitted, the
        training set itself is used as the evaluation set.
    test_data_dict : dict, optional
        Test datasets, as ``{name: DataFrame}``.
    ret_perf : bool, default True
        Whether to compute and return model performance.
    nbins : int, default 10
        Number of bins for the gains table.
    precision : int, default 5
        Numeric precision.
    min_bin_prop : float, default 0.05
        Minimum bin proportion.
    include_missing : bool, default True
        Whether to include a bin for missing values.
    equal_freq : bool, default True
        Whether to use equal-frequency binning.
    ascending : bool, default True
        Accepted for interface compatibility; not referenced by this function.
    fillna : float, optional
        Accepted for interface compatibility; not referenced by this function.
    spec_values : list, optional
        Accepted for interface compatibility; not referenced by this function.
    weight_col : str, optional
        Sample-weight column name for the training data.
    validation_weight_col : str, optional
        Sample-weight column name for the validation data; falls back to
        ``weight_col`` when falsy.
    wgt_col : str, optional
        Alias consulted only when ``weight_col`` is falsy.

    Returns
    -------
    tuple
        ``(selected_vars, model, perf_dict)`` when ``ret_perf`` is true,
        otherwise ``(selected_vars, model)``. ``perf_dict`` maps each dataset
        name ("mdl", "hd" when validation data is given, plus the test-set
        names) to the result of ``_backward_perf_summary``.

    Raises
    ------
    TypeError
        If any supplied dataset is not a ``pandas.DataFrame``.
    ImportError
        If lightgbm is not installed.

    Examples
    --------
    >>> selected_vars, model, perf = backward_lgbm(
    ...     train_data=train_df,
    ...     varlist=feature_cols,
    ...     dep='target',
    ...     validation_data=val_df
    ... )
    """
    try:
        import lightgbm as lgb
    except ImportError as exc:
        raise ImportError("请安装lightgbm: pip install lightgbm") from exc

    # Work on a private copy so the preset keys added below never leak into
    # the dict owned by the caller.
    params = dict(varreduct_params) if varreduct_params is not None else {}

    if test_data_dict is None:
        test_data_dict = {}

    datain_all = OrderedDict()
    datain_all["mdl"] = train_data
    if validation_data is not None:
        datain_all["hd"] = validation_data
    datain_all.update(test_data_dict)

    # Validate the input types explicitly: an ``assert`` would be stripped
    # under ``python -O`` and the documented TypeError would never fire.
    for frame in datain_all.values():
        if not isinstance(frame, pd.DataFrame):
            logging.warning("请提供pandas.DataFrame格式数据")
            raise TypeError("请提供pandas.DataFrame格式数据")

    # Preset parameters (keep the model reproducible).
    hyperparams_preset = {
        "metric": stopping_metric,
        "seed": seed,
        "objective": "binary",
        "boosting_type": "gbdt",
        "num_threads": 8
    }

    # Fill in only the required parameters the caller did not provide.
    for key, value in hyperparams_preset.items():
        params.setdefault(key, value)

    weight_col = weight_col or wgt_col

    # Build the LightGBM datasets.
    train_weight = resolve_sample_weight(data=train_data, weight_col=weight_col, expected_len=len(train_data))
    lgb_train = lgb.Dataset(train_data[varlist], label=train_data[dep], weight=train_weight)

    if validation_data is not None:
        valid_weight = resolve_sample_weight(
            data=validation_data,
            weight_col=validation_weight_col or weight_col,
            expected_len=len(validation_data),
        )
        lgb_valid = lgb.Dataset(validation_data[varlist], label=validation_data[dep], weight=valid_weight)
        valid_sets = [lgb_valid]
        valid_names = ["hd"]
    else:
        # No validation data: early stopping monitors the training set.
        valid_sets = [lgb_train]
        valid_names = ["mdl"]

    callbacks = [
        lgb.early_stopping(stopping_rounds=early_stopping_rounds, verbose=False),
        lgb.log_evaluation(period=-1)
    ]

    # Train the model.
    model = lgb.train(
        params=params,
        train_set=lgb_train,
        num_boost_round=num_boost_round,
        valid_sets=valid_sets,
        valid_names=valid_names,
        callbacks=callbacks
    )

    # Rank the features by importance, most important first.
    importance_df = pd.DataFrame({
        "feature": model.feature_name(),
        "importance": model.feature_importance(importance_type=importance_type)
    }).sort_values("importance", ascending=False).reset_index(drop=True)

    importance_df["cum_importance"] = importance_df["importance"].cumsum() / importance_df["importance"].sum()

    # Keep the features whose cumulative share stays within the threshold.
    # NOTE(review): the feature that crosses the threshold is excluded, so
    # the kept set can cover less than the threshold - confirm this is intended.
    selected_idx = importance_df[importance_df["cum_importance"] <= cum_importance_threshold].index.tolist()

    # Guarantee at least ``min_vars`` variables (or all of them if fewer exist).
    if len(selected_idx) < min_vars:
        selected_idx = list(range(min(min_vars, len(importance_df))))

    selected_vars = importance_df.loc[selected_idx, "feature"].tolist()

    if not ret_perf:
        return selected_vars, model

    # Score every dataset with the model trained on the full ``varlist`` and
    # summarise its performance.
    score_col = "_lgbm_score"
    perf_dict = {}

    for name, df in datain_all.items():
        # Score a copy so the caller's frames do not gain a score column.
        df_score = df.copy()
        df_score[score_col] = model.predict(df_score[varlist])
        perf_dict[name] = _backward_perf_summary(
            df_score,
            split_name=name,
            dep=dep,
            score_col=score_col,
            weight_col=weight_col,
            validation_weight_col=validation_weight_col,
            nbins=nbins,
            precision=precision,
            min_bin_prop=min_bin_prop,
            include_missing=include_missing,
            equal_freq=equal_freq,
        )

    return selected_vars, model, perf_dict

backward_xgbm

backward_xgbm(train_data, varlist: List[str], dep: str, varreduct_params=None, stopping_metric='auc', seed=42, num_boost_round: int = 200, early_stopping_rounds: int = 20, importance_type: str = 'gain', cum_importance_threshold: float = 0.99, min_vars: int = 10, validation_data=None, test_data_dict=None, ret_perf: bool = True, nbins: int = 10, precision: int = 5, min_bin_prop: float = 0.05, include_missing: bool = True, equal_freq: bool = True, ascending: bool = True, fillna: Optional[float] = None, spec_values: Optional[List] = None, monotone_constraints: Optional[Dict[str, int]] = None, weight_col: Optional[str] = None, validation_weight_col: Optional[str] = None, wgt_col=None)

使用XGBoost模型进行向后变量消除。

通过训练XGBoost模型并根据特征重要性累计阈值筛选变量, 实现向后变量消除(Backward Variable Elimination)。

参数:

名称 类型 描述 默认
train_data DataFrame

训练数据集,必须包含dep列和varlist中的所有特征列

必需
varlist list of str

参与建模的特征变量列表

必需
dep str

目标变量列名(0/1二元变量)

必需
varreduct_params dict

XGBoost超参数字典,未指定的必需参数将使用预设值

None
stopping_metric str

早停评估指标

"auc"
seed int

随机种子

42
num_boost_round int

最大迭代轮数

200
early_stopping_rounds int

早停轮数

20
importance_type str

特征重要性类型

"gain"
cum_importance_threshold float

累计特征重要性阈值

0.99
min_vars int

保留的最小变量数量

10
validation_data DataFrame

验证数据集

None
test_data_dict dict

测试数据集字典

None
ret_perf bool

是否返回性能指标

True
nbins int

增益表分箱数

10
precision int

数值精度

5
min_bin_prop float

最小分箱比例

0.05
include_missing bool

是否包含缺失值分箱

True
equal_freq bool

是否使用等频分箱

True
ascending bool

增益表是否升序排列

True
fillna float

缺失值填充值

None
spec_values list

特殊值列表

None
monotone_constraints dict

单调约束字典,格式为 {特征名: 1/-1}

None

返回:

类型 描述
tuple

(selected_vars, model, perf_dict) 或 (selected_vars, model)

引发:

类型 描述
TypeError

当输入数据不是pandas.DataFrame格式时

ImportError

当xgboost未安装时

示例:

>>> selected_vars, model, perf = backward_xgbm(
...     train_data=train_df,
...     varlist=feature_cols,
...     dep='target',
...     validation_data=val_df
... )
源代码位于: Modeling_Tool/Model/Backward_Tool.py
def backward_xgbm(
    train_data: pd.DataFrame,
    varlist: List[str],
    dep: str,
    varreduct_params: Optional[Dict] = None,
    stopping_metric: str = "auc",
    seed: int = 42,
    num_boost_round: int = 200,
    early_stopping_rounds: int = 20,
    importance_type: str = "gain",
    cum_importance_threshold: float = 0.99,
    min_vars: int = 10,
    validation_data: Optional[pd.DataFrame] = None,
    test_data_dict: Optional[Dict[str, pd.DataFrame]] = None,
    ret_perf: bool = True,
    nbins: int = 10,
    precision: int = 5,
    min_bin_prop: float = 0.05,
    include_missing: bool = True,
    equal_freq: bool = True,
    ascending: bool = True,
    fillna: Optional[float] = None,
    spec_values: Optional[List] = None,
    monotone_constraints: Optional[Dict[str, int]] = None,
    weight_col: Optional[str] = None,
    validation_weight_col: Optional[str] = None,
    wgt_col: Optional[str] = None,
) -> Tuple:
    """
    Run one pass of backward variable elimination with an XGBoost model.

    A booster is trained on ``varlist``; features are then ranked by
    importance and only those whose cumulative importance share stays within
    ``cum_importance_threshold`` are kept.

    Parameters
    ----------
    train_data : pd.DataFrame
        Training data; must contain ``dep`` and every column in ``varlist``.
    varlist : list of str
        Candidate feature columns used to train the model.
    dep : str
        Target column name (documented upstream as a 0/1 binary variable).
    varreduct_params : dict, optional
        XGBoost training parameters. The keys ``eval_metric``,
        ``tree_method``, ``booster``, ``seed`` and ``monotone_constraints``
        are filled with presets only when absent, so a value supplied here
        takes precedence over ``stopping_metric``, ``seed`` and
        ``monotone_constraints``. The dict is copied and never modified.
    stopping_metric : str, default "auc"
        Preset value for ``eval_metric``.
    seed : int, default 42
        Preset value for the XGBoost ``seed`` parameter.
    num_boost_round : int, default 200
        Maximum number of boosting rounds.
    early_stopping_rounds : int, default 20
        Early-stopping patience passed to ``xgb.train``.
    importance_type : str, default "gain"
        Importance type passed to ``Booster.get_score``.
    cum_importance_threshold : float, default 0.99
        A feature is kept when the cumulative importance share up to and
        including it (in descending importance order) is ``<=`` this value.
    min_vars : int, default 10
        If fewer than ``min_vars`` features pass the threshold, the top
        ``min_vars`` features by importance are kept instead (capped at the
        number of features available).
    validation_data : pd.DataFrame, optional
        Hold-out data, registered as evaluation set ``"hd"``.
    test_data_dict : dict, optional
        Extra named DataFrames that are scored for the performance report.
        Entries named ``"mdl"`` or ``"hd"`` replace the train / validation
        frames in that report.
    ret_perf : bool, default True
        Whether to compute and return the per-dataset performance summary.
    nbins, precision, min_bin_prop, include_missing, equal_freq
        Forwarded unchanged to ``_backward_perf_summary``.
    ascending, fillna, spec_values
        Accepted for interface compatibility; not used by this function.
    monotone_constraints : dict, optional
        Mapping ``{feature: 1 / -1}``; features not listed default to 0.
    weight_col : str, optional
        Sample-weight column, resolved through ``resolve_sample_weight`` for
        the training data and forwarded to ``_backward_perf_summary``.
    validation_weight_col : str, optional
        Sample-weight column for ``validation_data``; falls back to
        ``weight_col`` when falsy.
    wgt_col : str, optional
        Fallback used when ``weight_col`` is falsy (appears to be an
        alternative spelling of the same option).

    Returns
    -------
    tuple
        ``(selected_vars, model, perf_dict)`` when ``ret_perf`` is true,
        otherwise ``(selected_vars, model)``. ``perf_dict`` maps each
        dataset name to the result of ``_backward_perf_summary``.

    Raises
    ------
    ImportError
        If xgboost is not installed.
    TypeError
        If any supplied dataset is not a ``pandas.DataFrame``.

    Examples
    --------
    >>> selected_vars, model, perf = backward_xgbm(
    ...     train_data=train_df,
    ...     varlist=feature_cols,
    ...     dep='target',
    ...     validation_data=val_df
    ... )
    """
    try:
        import xgboost as xgb
    except ImportError as err:
        raise ImportError("请安装xgboost: pip install xgboost") from err

    # Copy the caller's dict: filling presets in place would leak stale
    # `seed` / `eval_metric` / `monotone_constraints` into later calls that
    # reuse the same dict with a different variable list.
    varreduct_params = {} if varreduct_params is None else dict(varreduct_params)

    if test_data_dict is None:
        test_data_dict = {}

    if monotone_constraints is None:
        monotone_constraints = {}

    # One constraint entry per model feature; 0 when the caller gave none.
    mc_dict = {var: monotone_constraints.get(var, 0) for var in varlist}

    datain_all = OrderedDict()
    datain_all["mdl"] = train_data
    if validation_data is not None:
        datain_all["hd"] = validation_data
    datain_all.update(test_data_dict)

    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if not all(isinstance(frame, pd.DataFrame) for frame in datain_all.values()):
        logging.warning("请提供pandas.DataFrame格式数据")
        raise TypeError("请提供pandas.DataFrame格式数据")

    # Presets are applied only for keys the caller did not provide.
    hyperparams_preset = {
        'eval_metric': stopping_metric,
        'tree_method': 'exact',
        'booster': 'gbtree',
        'seed': seed,
        'monotone_constraints': mc_dict
    }
    for param, preset_value in hyperparams_preset.items():
        varreduct_params.setdefault(param, preset_value)

    weight_col = weight_col or wgt_col

    # Build the XGBoost datasets.
    train_weight = resolve_sample_weight(data=train_data, weight_col=weight_col, expected_len=len(train_data))
    xgb_train = xgb.DMatrix(train_data[varlist], label=train_data[dep], weight=train_weight)

    # The validation set is appended last; xgboost uses the last entry of
    # `evals` for early stopping.
    evals = [(xgb_train, "mdl")]
    if validation_data is not None:
        valid_weight = resolve_sample_weight(
            data=validation_data,
            weight_col=validation_weight_col or weight_col,
            expected_len=len(validation_data),
        )
        xgb_valid = xgb.DMatrix(validation_data[varlist], label=validation_data[dep], weight=valid_weight)
        evals.append((xgb_valid, "hd"))

    # Train the model.
    evals_result = {}
    model = xgb.train(
        params=varreduct_params,
        dtrain=xgb_train,
        num_boost_round=num_boost_round,
        evals=evals,
        early_stopping_rounds=early_stopping_rounds,
        evals_result=evals_result,
        verbose_eval=False
    )

    # Rank the features by importance.
    importance_raw = model.get_score(importance_type=importance_type)
    importance_df = pd.DataFrame(
        list(importance_raw.items()), columns=["feature", "importance"]
    ).sort_values("importance", ascending=False).reset_index(drop=True)

    # Features missing from the score dict are appended with importance 0,
    # in `varlist` order, after the ranked ones.
    scored_feats = set(importance_raw)
    missing_feats = [f for f in varlist if f not in scored_feats]
    if missing_feats:
        zero_df = pd.DataFrame({"feature": missing_feats, "importance": 0.0})
        importance_df = pd.concat([importance_df, zero_df], ignore_index=True)

    # `or 1` guards against division by zero when every importance is 0.
    importance_df["cum_importance"] = importance_df["importance"].cumsum() / (importance_df["importance"].sum() or 1)

    selected_idx = importance_df[importance_df["cum_importance"] <= cum_importance_threshold].index.tolist()

    # Too few survivors: keep the top `min_vars` rows of the ranking instead.
    if len(selected_idx) < min_vars:
        selected_idx = list(range(min(min_vars, len(importance_df))))

    selected_vars = importance_df.loc[selected_idx, "feature"].tolist()

    if not ret_perf:
        return selected_vars, model

    # Score every dataset with the full-variable model and summarise it.
    score_col = "_xgbm_score"
    perf_dict = {}

    for name, df in datain_all.items():
        df_score = df.copy()  # never add the score column to caller data
        xgb_dmat = xgb.DMatrix(df_score[varlist])
        df_score[score_col] = model.predict(xgb_dmat)
        perf_dict[name] = _backward_perf_summary(
            df_score,
            split_name=name,
            dep=dep,
            score_col=score_col,
            weight_col=weight_col,
            validation_weight_col=validation_weight_col,
            nbins=nbins,
            precision=precision,
            min_bin_prop=min_bin_prop,
            include_missing=include_missing,
            equal_freq=equal_freq,
        )

    return selected_vars, model, perf_dict