import math

import polars as pl
[docs]
def compute_single_metric(
    y_pred: pl.Series,
    y: pl.Series,
    metric: str,
    weights: pl.Series | None = None,
) -> float:
    """Compute a single performance metric for one boolean prediction series.

    Faster than compute_metrics when only one scalar is needed, because it
    skips computing all 25+ derived metrics. Used internally by
    combine_rules_beam_search during candidate evaluation.

    Parameters
    ----------
    y_pred : pl.Series
        Prediction series; cast to Boolean before use.
    y : pl.Series
        Target series; cast to Boolean before use.
    metric : str
        Metric name: "precision", "recall", "accuracy", or an F-beta score
        written as ``f<number>`` (for example ``"f1"`` or ``"f0.5"``).
    weights : pl.Series | None, default=None
        Optional sample weights. When provided, every confusion-matrix cell
        is the sum of the weights in that cell instead of a row count.

    Returns
    -------
    float
        The requested metric value. A ratio whose denominator is zero is
        reported as 0.0.

    Raises
    ------
    ValueError
        If *metric* is not one of the supported names, or if the part after
        the leading ``f`` is not a finite number.
    """
    # Resolve the metric name before touching the data, so that a bad name
    # fails fast and always with the same documented message (instead of the
    # unrelated "could not convert string to float" error from float()).
    beta = None
    if metric not in ("precision", "recall", "accuracy"):
        unsupported = ValueError(
            f"Unsupported metric '{metric}'. Must be 'precision', 'recall', "
            f"'accuracy', or an F-beta score (f<number>)."
        )
        if not metric.startswith("f"):
            raise unsupported
        try:
            beta = float(metric[1:])
        except ValueError:
            raise unsupported from None
        # float() accepts "nan" and "inf"; both would turn the score into NaN.
        if not math.isfinite(beta):
            raise unsupported

    y_bool = y.cast(pl.Boolean)
    y_pred_bool = y_pred.cast(pl.Boolean)

    def _cell_total(mask: pl.Series) -> float:
        """Return the row count, or the weight sum, of one confusion cell."""
        if weights is None:
            return float(mask.sum())
        return float(weights.filter(mask).sum())

    TP = _cell_total(y_bool & y_pred_bool)
    FP = _cell_total(~y_bool & y_pred_bool)
    FN = _cell_total(y_bool & ~y_pred_bool)

    if metric == "accuracy":
        # True negatives are only needed for accuracy, so they are computed
        # lazily here.
        TN = _cell_total(~y_bool & ~y_pred_bool)
        total = TP + TN + FP + FN
        return (TP + TN) / total if total > 0 else 0.0

    precision = TP / (TP + FP) if (TP + FP) > 0 else 0.0
    if metric == "precision":
        return precision

    recall = TP / (TP + FN) if (TP + FN) > 0 else 0.0
    if metric == "recall":
        return recall

    # Only a validated F-beta request reaches this point.
    denom = beta**2 * precision + recall
    return (1 + beta**2) * precision * recall / denom if denom > 0 else 0.0
[docs]
def _f_beta_expr(precision_col: str, recall_col: str, beta: float) -> pl.Expr:
    """Build the F-beta expression from existing precision and recall columns."""
    beta_sq = beta**2
    precision = pl.col(precision_col)
    recall = pl.col(recall_col)
    return (1 + beta_sq) * precision * recall / (beta_sq * precision + recall)


def compute_metrics(
    R: pl.Series | pl.DataFrame,
    y: pl.Series,
    weights: pl.Series | None = None,
    betas: list[float] | None = None,
) -> pl.DataFrame:
    """Compute performance metrics for all rule columns.

    Calculates the confusion matrix, precision, recall, accuracy, flag rates
    and F-beta scores for each rule. Optionally computes weighted versions of
    the confusion matrix, precision, recall, accuracy and F-beta scores.

    Ratios with a zero denominator are not special-cased: they keep whatever
    Polars' division yields (typically NaN) rather than being coerced to 0.0.

    Parameters
    ----------
    R : pl.DataFrame | pl.Series
        Rule predictions. Each column is a rule that evaluates to True/False
        for each observation; a single Series is treated as a one-column
        frame. Columns that are not already Boolean are cast to Boolean.
    y : pl.Series
        Boolean target series indicating true labels (True for positive class).
        Will be cast to Boolean if not already.
    weights : pl.Series | None, default=None
        Optional numeric series for weighted metrics computation. If provided,
        both count-based and weighted columns are returned.
    betas : list[float], default=[0.25, 0.5, 1, 1.5, 2]
        F-beta values to compute. Each value ``b`` produces a column named
        ``f`` followed by ``b`` in ``%g`` formatting, e.g. ``f0.25``, ``f1``,
        ``f1.5`` (and the same name with a ``_weight`` suffix when *weights*
        is provided).

    Returns
    -------
    pl.DataFrame
        DataFrame with one row per rule containing:

        - rule: Rule name (column name from R)
        - TP, FP, TN, FN: Confusion matrix counts
        - precision, recall, accuracy: Standard classification metrics
        - flagged(%): Percentage of all observations flagged as positive
        - good_flagged(%): Percentage of negatives flagged as positive
        - f<b> for each b in *betas*: F-beta scores
        - num_rules: Number of ``") | ("`` separators in the rule name plus
          one (1 for a single rule)

        If weights is provided, additional columns with "_weight" suffix:

        - TP_weight, FP_weight, TN_weight, FN_weight: Weighted confusion matrix
        - total_weight: Sum of the four weighted cells
        - precision_weight, recall_weight, accuracy_weight: Weighted versions
        - f<b>_weight for each b in *betas*: Weighted F-beta scores

    Examples
    --------
    >>> import polars as pl
    >>> R = pl.DataFrame({"rule_a": [True, True, False], "rule_b": [False, True, True]})
    >>> y = pl.Series([True, False, True])
    >>> # Count-based metrics only
    >>> metrics_df = compute_metrics(R, y, weights=None)
    >>>
    >>> # Both count and weighted metrics
    >>> transaction_amounts = pl.Series([10.0, 5.0, 20.0])
    >>> metrics_df = compute_metrics(R, y, weights=transaction_amounts)
    >>>
    >>> # Sort by F1 to find best rules
    >>> top_rules = metrics_df.sort("f1", descending=True).head(10)
    """
    if betas is None:
        betas = [0.25, 0.5, 1, 1.5, 2]
    if y.dtype != pl.Boolean:
        y = y.cast(pl.Boolean)
    if isinstance(R, pl.Series):
        R = R.to_frame()

    # Pre-create the output columns in their final order: rule name, the four
    # count cells, then (only when weighted) the four weighted cells.
    cells = ("TP", "FP", "TN", "FN")
    data: dict[str, list] = {"rule": list(R.columns)}
    for cell in cells:
        data[cell] = []
    if weights is not None:
        for cell in cells:
            data[f"{cell}_weight"] = []

    not_y = ~y
    for col in R.columns:
        pred = R[col]
        # On a non-boolean column `~` is a bitwise NOT, which would corrupt
        # the TN/FN masks, so normalise the dtype first (as is done for y).
        if pred.dtype != pl.Boolean:
            pred = pred.cast(pl.Boolean)
        not_pred = ~pred
        # Each mask is built once and reused for both the count and the
        # weighted sum.
        masks = {
            "TP": y & pred,
            "FP": not_y & pred,
            "TN": not_y & not_pred,
            "FN": y & not_pred,
        }
        for cell, mask in masks.items():
            data[cell].append(mask.sum())
            if weights is not None:
                data[f"{cell}_weight"].append(weights.filter(mask).sum())

    metrics_df = pl.DataFrame(data)

    total = pl.col("TP") + pl.col("FP") + pl.col("TN") + pl.col("FN")

    # Step 1: basic metrics. They must exist as columns before the F-beta
    # expressions below can reference them.
    metrics_df = metrics_df.with_columns(
        [
            (pl.col("TP") / (pl.col("TP") + pl.col("FP"))).alias("precision"),
            (pl.col("TP") / (pl.col("TP") + pl.col("FN"))).alias("recall"),
            ((pl.col("TP") + pl.col("TN")) / total).alias("accuracy"),
        ]
    )

    # Step 2: derived metrics that depend on the columns added in step 1.
    expressions = [
        ((pl.col("TP") + pl.col("FP")) / total * 100).alias("flagged(%)"),
        (pl.col("FP") / (pl.col("TN") + pl.col("FP")) * 100).alias("good_flagged(%)"),
        *[_f_beta_expr("precision", "recall", b).alias(f"f{b:g}") for b in betas],
        # Combined rules are joined with ") | (", so separators + 1 = rules.
        (pl.col("rule").str.count_matches(r"\) \| \(") + 1).alias("num_rules"),
    ]

    if weights is not None:
        # total_weight is materialised first because accuracy_weight reads it.
        metrics_df = metrics_df.with_columns(
            [
                (
                    pl.col("TP_weight")
                    + pl.col("FP_weight")
                    + pl.col("TN_weight")
                    + pl.col("FN_weight")
                ).alias("total_weight"),
            ]
        )
        metrics_df = metrics_df.with_columns(
            [
                (pl.col("TP_weight") / (pl.col("TP_weight") + pl.col("FP_weight"))).alias(
                    "precision_weight"
                ),
                (pl.col("TP_weight") / (pl.col("TP_weight") + pl.col("FN_weight"))).alias(
                    "recall_weight"
                ),
                ((pl.col("TP_weight") + pl.col("TN_weight")) / pl.col("total_weight")).alias(
                    "accuracy_weight"
                ),
            ]
        )
        expressions.extend(
            _f_beta_expr("precision_weight", "recall_weight", b).alias(f"f{b:g}_weight")
            for b in betas
        )

    return metrics_df.with_columns(expressions)