# Source code for iguanas.rule_classifier

from __future__ import annotations

from typing import Any

import numpy as np
import polars as pl
from pydantic import BaseModel, ConfigDict, Field, PrivateAttr, field_validator
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.exceptions import NotFittedError
from xgboost import XGBClassifier

from .rule_evaluation import apply_and_filter_by_performance, apply_rules
from .rule_generation import rule_grid_search

# Polars dtypes accepted as numeric feature columns. Only signed integer and
# floating-point types are listed; unsigned integer dtypes are not included.
_NUMERIC_DTYPES = (
    pl.Int8,
    pl.Int16,
    pl.Int32,
    pl.Int64,
    pl.Float32,
    pl.Float64,
)



class RuleClassifier(BaseModel, BaseEstimator, ClassifierMixin):
    """Rule-based classifier that selects the single best rule.

    The best rule is selected through the following steps:

    1. **Rule generation**: candidate rules are extracted from XGBoost decision
       trees trained across a sweep of ``scale_pos_weight`` values.
    2. **Performance filtering**: rules that fail any condition in
       ``metric_thresholds`` are discarded.
    3. **Ranking**: the surviving rules are sorted by ``ranking_metric`` (descending)
       and the top-ranked rule is stored in ``_best_rule_``.

    If no rule is generated, or none survives filtering, ``_best_rule_`` is the
    empty string and ``predict`` returns ``False`` for every row.

    Parameters
    ----------
    estimator : XGBClassifier
        XGBoost classifier used for rule generation.
    scale_pos_weights : list[float] | np.ndarray, default=np.array([1.0])
        Array of scale_pos_weight values swept during rule generation.
    sample_weights_df : pl.DataFrame | None, default=None
        DataFrame of sample weights used for rule generation.
    ranking_metric : str, default="accuracy"
        Metric used to rank candidate rules. The single highest-scoring rule
        is kept. Must be a column produced by compute_metrics.
    metric_thresholds : list[dict[str, Any]] | None, default=None
        List of threshold dicts used to filter candidate rules. Each dict must
        have keys ``"name"`` (metric column), ``"operator"`` (one of
        ``">="``, ``">"``, ``"<="``, ``"<"``, ``"=="``, ``"!="``), and
        ``"value"`` (numeric threshold). All conditions are combined with AND.
        A ``"value"`` that is present must lie in ``[0, 1]``.
        If None, the default threshold of ``apply_and_filter_by_performance``
        is used.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    estimator: XGBClassifier
    scale_pos_weights: list[float] | np.ndarray = Field(default_factory=lambda: np.array([1.0]))
    sample_weights_df: pl.DataFrame | None = None
    ranking_metric: str = "accuracy"
    metric_thresholds: list[dict[str, Any]] | None = None

    # Learned attributes (set by fit, not part of the model schema)
    _feature_cols_: list[str] = PrivateAttr(default_factory=list)
    _best_rule_: str = PrivateAttr(default="")

    @field_validator("metric_thresholds")
    @classmethod
    def _check_metric_thresholds(
        cls, v: list[dict[str, Any]] | None
    ) -> list[dict[str, Any]] | None:
        """Validate that every supplied threshold ``"value"`` lies in ``[0, 1]``.

        Thresholds without a ``"value"`` (or with ``None``) are passed through
        unchecked.

        Raises
        ------
        ValueError
            If a value is not comparable to a number or falls outside ``[0, 1]``.
        """
        if v is None:
            return v
        for threshold in v:
            value = threshold.get("value")
            if value is None:
                continue
            try:
                in_range = 0.0 <= value <= 1.0
            except TypeError as err:
                # pydantic only turns ValueError/AssertionError into a
                # ValidationError, so a non-numeric value must not leak out
                # as a raw TypeError.
                raise ValueError(
                    f"metric_thresholds value {value!r} is not numeric for threshold {threshold!r}"
                ) from err
            if not in_range:
                raise ValueError(
                    f"metric_thresholds value {value!r} is out of range [0, 1] "
                    f"for threshold {threshold!r}"
                )
        return v

    def fit(self, X: pl.DataFrame, y: pl.Series) -> RuleClassifier:
        """Generate, filter, and select the single best rule from training data.

        Parameters
        ----------
        X : pl.DataFrame
            Feature DataFrame. Only columns whose dtype is listed in
            ``_NUMERIC_DTYPES`` are used for rule generation.
        y : pl.Series
            Binary target series.

        Returns
        -------
        RuleClassifier
            Fitted classifier instance (self).

        Raises
        ------
        ValueError
            If ``X`` has no column with a dtype in ``_NUMERIC_DTYPES``.
        """
        feature_cols = [c for c, dt in X.schema.items() if dt in _NUMERIC_DTYPES]
        if not feature_cols:
            raise ValueError(
                "X must contain at least one numeric column for rule generation."
            )
        features = X[feature_cols]

        rules = rule_grid_search(
            self.estimator,
            features.to_pandas(),
            y.to_pandas(),
            scale_pos_weights=self.scale_pos_weights,
            sample_weights_df=self.sample_weights_df,
        )

        best_rule = ""
        if not rules.is_empty():
            # Keep the first occurrence in a stable order so that ties on the
            # ranking metric resolve the same way on every run.
            rules = rules.unique("rule", keep="first", maintain_order=True)

            _, metrics = apply_and_filter_by_performance(
                features,
                y,
                rules["rule"].to_list(),
                metric_thresholds=self.metric_thresholds,
                ranking_metric=self.ranking_metric,
            )
            if metrics.height > 0:
                best_rule = metrics["rule"].item(0)

        # Commit learned state only after every step succeeded, so a failed
        # (re)fit never leaves new columns paired with a stale rule.
        self._feature_cols_ = feature_cols
        self._best_rule_ = best_rule
        return self

    def _check_is_fitted(self) -> None:
        """Raise ``NotFittedError`` unless ``fit`` has stored feature columns."""
        if not self._feature_cols_:
            raise NotFittedError(
                f"This {type(self).__name__} instance is not fitted yet. "
                "Call 'fit' with appropriate arguments before using this estimator."
            )

    def predict(self, X: pl.DataFrame) -> pl.Series:
        """Predict binary labels using the single best rule.

        Parameters
        ----------
        X : pl.DataFrame
            Feature DataFrame with the same columns seen during fit.

        Returns
        -------
        pl.Series
            Series named "prediction". When no rule was selected during fit it
            is an all-``False`` Boolean series of length ``X.height``.

        Raises
        ------
        NotFittedError
            If the classifier has not been fitted.
        """
        self._check_is_fitted()
        if not self._best_rule_:
            return pl.Series("prediction", [False] * X.height, dtype=pl.Boolean)

        fired = apply_rules(X[self._feature_cols_], [self._best_rule_])
        # The rule output column is named after the rule string; rename it to
        # the documented, stable name.
        return fired[self._best_rule_].alias("prediction")

    def predict_proba(self, X: pl.DataFrame) -> pl.Series:
        """Predict probability using the single best rule.

        - Rule fires  → 1.0
        - Rule does not fire → 0.0

        Parameters
        ----------
        X : pl.DataFrame
            Feature DataFrame with the same columns seen during fit.

        Returns
        -------
        pl.Series
            Float64 series named "proba" with values in {0.0, 1.0}.

        Raises
        ------
        NotFittedError
            If the classifier has not been fitted.
        """
        return self.predict(X).cast(pl.Float64).alias("proba")

    def fit_predict(self, X: pl.DataFrame, y: pl.Series) -> pl.Series:
        """Fit classifier and return binary predictions on the same data."""
        return self.fit(X, y).predict(X)