from functools import reduce
from typing import Any
import polars as pl
from .metrics import compute_metrics
from .rule_selection import filter_correlated_rules
# Mapping from operator string to the corresponding Polars Series/Expr method name.
# Defined at module level to avoid reconstruction on every function call.
_OPS: dict[str, str] = {
">=": "__ge__",
">": "__gt__",
"<=": "__le__",
"<": "__lt__",
"==": "__eq__",
"!=": "__ne__",
}
[docs]
def apply_rules(X: pl.DataFrame, rules: list[str]) -> pl.DataFrame:
r"""Evaluate rule expressions on a DataFrame to produce boolean predictions.
Takes a list of rule strings (logical conditions) and evaluates them against
the input DataFrame to produce a boolean DataFrame where each column represents
the evaluation result of one rule.
Parameters
----------
X : pl.DataFrame
Input DataFrame on which to evaluate the rules. Must contain all columns
referenced in the rule expressions.
rules : list[str]
List of rule expressions as strings. Each rule should be a valid
Polars expression that evaluates to a boolean result.
Format: ``'(X["column_name"] operator value) [& \| \|] ...'``
Returns
-------
pl.DataFrame
DataFrame containing only the evaluated rule columns as boolean values.
Each column name matches the corresponding rule expression string.
Shape: (len(X), len(rules))
Examples
--------
>>> import polars as pl
>>> X = pl.DataFrame({"age": [25, 30, 35], "income": [50000, 60000, 70000]})
>>> rules = ['(X["age"] >= 30)', '(X["income"] > 55000)']
>>> R = apply_rules(X, rules)
>>> R.columns
['(X["age"] >= 30)', '(X["income"] > 55000)']
>>> R
shape: (3, 2)
┌──────────────────┬─────────────────────┐
│ (X["age"] >= 30) │ (X["income"] > ... │
│ --- │ --- │
│ bool │ bool │
╞══════════════════╪═════════════════════╡
│ false │ false │
│ true │ true │
│ true │ true │
└──────────────────┴─────────────────────┘
Notes
-----
Uses Python's `eval()` with a restricted namespace for security.
The namespace includes only `pl` (Polars) and `X` (the input DataFrame).
"""
# Provide explicit namespace for eval to ensure cross-platform compatibility
namespace = {"pl": pl, "X": X}
exprs = [eval(rule, namespace).alias(rule) for rule in rules]
return X.with_columns(exprs).select(rules)
[docs]
def select_diverse_top_rules(
R: pl.DataFrame,
metrics: pl.DataFrame,
max_corr: float = 0.8,
importance_metric: str = "f0.5",
top_n: int | None = None,
ranking_metric: str = "f1",
) -> tuple[pl.DataFrame, pl.DataFrame, list[str]]:
"""Select top performing rules while removing highly correlated duplicates.
Ranks rules by a performance metric, optionally limits to top N rules,
then filters out correlated rules to produce a diverse set of high-quality
rules. When rules are correlated, keeps the one with higher importance.
Parameters
----------
R : pl.DataFrame
Boolean DataFrame with rule predictions (columns are rules, rows are samples).
metrics : pl.DataFrame
Performance metrics for the rules in R.
Must contain columns 'rule', importance_metric, and ranking_metric metric.
max_corr : float, default=0.8
Maximum correlation threshold. Rule pairs with correlation > max_corr
are considered too similar, and only the more important one is kept.
importance_metric : str, default="f0.5"
Metric name to use for determining rule importance when filtering
correlated rules. Higher values indicate more important rules.
top_n : int | None, default=None
If specified, limits selection to top N rules by ranking_metric metric
before filtering correlations. If None, considers all rules.
ranking_metric : str, default="f1"
Metric name to sort and rank rules by (descending order) before
applying correlation filtering.
Returns
-------
R_filtered : pl.DataFrame
Boolean DataFrame containing only the selected uncorrelated rules.
metrics_filtered : pl.DataFrame
Performance metrics for the selected rules only.
selected_rule_list : list[str]
List of selected rule expressions (column names from R_filtered).
Examples
--------
>>> R = pl.DataFrame({
... "rule_A": [True, False, True, False],
... "rule_B": [True, False, True, True],
... "rule_C": [False, True, False, True]
... })
>>> metrics = pl.DataFrame({
... "rule": ["rule_A", "rule_B", "rule_C"],
... "f1": [0.8, 0.75, 0.6],
... "f0.5": [0.85, 0.78, 0.65]
... })
>>> R_filtered, metrics_filtered, rules = select_diverse_top_rules(
... R, metrics, max_corr=0.9, top_n=2
... )
>>> rules # ['rule_A', 'rule_C'] - rule_B removed due to correlation with rule_A
See Also
--------
filter_correlated_rules : Filter correlated boolean columns by importance
"""
# Guard against empty input
if R.is_empty() or metrics.is_empty() or "rule" not in metrics.columns:
print("Number of uncorrelated rules: 0")
return pl.DataFrame(), pl.DataFrame(), []
# Sort and optionally limit to top N
metrics_sorted = metrics.sort(ranking_metric, descending=True)
if top_n is not None:
metrics_sorted = metrics_sorted[:top_n]
# Filter correlated rules — returns list[str]
importance_dict = dict(metrics_sorted[["rule", importance_metric]].rows())
selected_rules = filter_correlated_rules(
R[metrics_sorted["rule"].to_list()], max_corr=max_corr, importance=importance_dict
)
R_filtered = R[selected_rules] if selected_rules else pl.DataFrame()
print(f"Number of uncorrelated rules: {len(selected_rules)}")
return R_filtered, metrics_sorted.filter(pl.col("rule").is_in(selected_rules)), selected_rules
[docs]
def apply_filter_and_deduplicate_rules(
X: pl.DataFrame,
y: pl.Series,
rules: list[str] | pl.DataFrame,
weight_column: str | None = None,
metric_thresholds: list[dict[str, Any]] | None = None,
top_n_rules: int | None = None,
max_corr: float = 0.8,
ranking_metric: str = "precision",
) -> tuple[pl.DataFrame, pl.DataFrame, list[str]]:
"""Complete pipeline to evaluate and filter rules on a dataset.
Performs a comprehensive rule evaluation workflow:
1. Evaluates rules on the dataset and filters by performance thresholds
2. Selects top N uncorrelated rules based on performance
3. Returns predictions, metrics, and selected rule list
Parameters
----------
X : pl.DataFrame
Feature DataFrame on which to evaluate rules.
y : pl.Series
Target series (boolean or binary).
rules : list[str] | pl.DataFrame
Either a list of rule expressions as strings, or a DataFrame with a 'rule' column.
weight_column : str | None, default=None
Name of weight column in X. If None, unweighted metrics are computed.
metric_thresholds : list[dict[str, Any]], default=[precision >= 0.2, recall >= 0.2]
List of threshold dicts forwarded to :func:`apply_and_filter_by_performance`.
Each dict must have keys ``"name"``, ``"operator"``, and ``"value"``.
All conditions are combined with AND.
top_n_rules : int | None, default=None
Maximum number of rules to keep after sorting by ranking_metric metric.
Applied before correlation filtering. If None, keeps all rules that pass thresholds.
max_corr : float, default=0.8
Maximum correlation threshold for filtering similar rules.
ranking_metric : str, default="precision"
Metric name to sort results by (descending order).
Returns
-------
R : pl.DataFrame
Boolean DataFrame with selected rule predictions.
Shape: (len(X), n_selected_rules)
metrics : pl.DataFrame
Performance metrics for selected rules.
selected_rules : list[str]
List of selected rule expressions that passed all filters.
Examples
--------
>>> import polars as pl
>>> # Combine train and test data
>>> X = pl.concat([X_train, X_test])
>>> y = pl.concat([y_train, y_test])
>>> rules = ['(X["age"] >= 30)', '(X["income"] > 55000)']
>>> thresholds = [{"name": "precision", "operator": ">=", "value": 0.5},
... {"name": "recall", "operator": ">=", "value": 0.5}]
>>> R, metrics, selected_rules = apply_filter_and_deduplicate_rules(
... X, y, rules, metric_thresholds=thresholds, top_n_rules=10
... )
>>> print(f"Selected {len(selected_rules)} rules")
>>> metrics[['rule', 'precision', 'recall', 'f1']]
Notes
-----
The function applies filtering in sequence:
- Threshold filtering removes low-performing rules
- Top-N selection limits the rule set size (optional)
- Correlation filtering ensures diversity in the final rule set
See Also
--------
apply_and_filter_by_performance : Evaluate rules on a single data split
select_diverse_top_rules : Select top rules while removing correlations
"""
if metric_thresholds is None:
metric_thresholds = [
{"name": "precision", "operator": ">=", "value": 0.2},
{"name": "recall", "operator": ">=", "value": 0.2},
]
# Convert rules_df to list if needed
if isinstance(rules, pl.DataFrame):
rules = rules.select("rule").to_series().to_list()
# Evaluate and filter by thresholds
R, metrics = apply_and_filter_by_performance(
X, y, rules, weight_column, metric_thresholds, ranking_metric
)
# Select top uncorrelated rules
R, metrics, selected_rules = select_diverse_top_rules(
R, metrics, max_corr=max_corr, top_n=top_n_rules, ranking_metric=ranking_metric
)
return R, metrics, selected_rules