Titanic Survival Prediction using Iguanas#

This notebook demonstrates a complete end-to-end example of using Iguanas for rule-based classification on the Kaggle Titanic dataset.

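The workflow includes:

- Loading the Titanic training data
- Generating the best rule based on the given metric
- Generating the best ruleset based on the given metric
- Applying the best ruleset to the test data to create a submission file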

1. Import Libraries#

[1]:

import numpy as np
import polars as pl
from xgboost import XGBClassifier

from iguanas.metrics import compute_metrics
from iguanas.rule_analysis import generate_rule_performance_report
from iguanas.rule_classifier import RuleClassifier
from iguanas.ruleset_classifier import RulesetClassifier

2. Load Data#

Load the Titanic training data and separate features from the target variable (Survived).

[2]:

# Read the training split and discard the PassengerId column before splitting
# the frame into the target (Survived) and the remaining feature columns.
train = pl.read_csv("../../../../../kaggle/titanic/train.csv").drop("PassengerId")
y_train = train.get_column("Survived")
X_train = train.drop("Survived")

3. Generate best rule based on given metric#

[3]:

# Small gradient-boosted model with a fixed seed, handed to the rule search.
estimator = XGBClassifier(
    n_estimators=10,
    max_depth=4,
    eval_metric="logloss",
    random_state=0,
)
# scale_pos_weights: 10 log-spaced values between 1e-3 and 1e3.
rule_est = RuleClassifier(estimator=estimator, scale_pos_weights=np.logspace(-3, 3, 10))
_ = rule_est.fit(X_train, y_train)
# In-sample predictions, scored in the next cell.
y_pred_train = rule_est.predict(X_train)

[4]:

# Score the single-rule predictions against the training labels; the bare name
# on the last line renders the resulting one-row metrics table.
M_train = compute_metrics(y_pred_train, y_train)
M_train

[4]:

shape: (1, 16)

rule	TP	FP	TN	FN	precision	recall	accuracy	flagged(%)	good_flagged(%)	f0.25	f0.5	f1	f1.5	f2	num_rules
str	i64	i64	i64	i64	f64	f64	f64	f64	f64	f64	f64	f64	f64	f64	u32
"(X["Pclass"] < 3.0) & (X["Fare…	176	91	443	150	0.659176	0.539877	0.719767	31.046512	17.041199	0.650718	0.631277	0.593592	0.571714	0.560153	1

[5]:

# Expression selected by the fitted RuleClassifier.
best_rule = rule_est._best_rule_
# Report for the full rule plus one row per condition it contains.
report = generate_rule_performance_report(best_rule, X_train, y_train)
report

[5]:

shape: (4, 17)

rule_index	rule	TP	FP	TN	FN	precision	recall	accuracy	flagged(%)	good_flagged(%)	f0.25	f0.5	f1	f1.5	f2	num_rules
str	str	i64	i64	i64	i64	f64	f64	f64	f64	f64	f64	f64	f64	f64	f64	u32
"0"	"(X["Pclass"] < 3.0) & (X["Fare…	176	91	443	150	0.659176	0.539877	0.719767	31.046512	17.041199	0.650718	0.631277	0.593592	0.571714	0.560153	1
"0.0"	"(X['Pclass'] < 3.0)"	223	177	372	119	0.5575	0.652047	0.667789	44.893378	32.240437	0.562296	0.57415	0.601078	0.619709	0.630656	1
"0.1"	"(X['Fare'] >= 13.7917)"	234	224	325	108	0.510917	0.684211	0.627385	51.402918	40.801457	0.518644	0.538178	0.585	0.619552	0.640745	1
"0.2"	"(X['Age'] < 64.0)"	289	412	12	1	0.412268	0.996552	0.421569	98.179272	97.169811	0.426995	0.467033	0.583249	0.693942	0.776464	1

4. Generate best ruleset based on given metric#

[6]:

# Same booster settings as the single-rule section; a new instance is built
# here and handed to the ruleset search.
estimator = XGBClassifier(
    n_estimators=10,
    max_depth=4,
    eval_metric="logloss",
    random_state=0,
)
ruleset_est = RulesetClassifier(
    estimator=estimator,
    # 50 log-spaced values between 1e-3 and 1e3.
    scale_pos_weights=np.logspace(-3, 3, 50),
    opt_metric="accuracy",
    # NOTE(review): presumably restricts candidates to accuracy >= 0.5 - confirm
    # against the Iguanas documentation.
    metric_thresholds=[
        {"name": "accuracy", "operator": ">=", "value": 0.5},
    ],
    max_rules=5,
)
_ = ruleset_est.fit(X_train, y_train)
# In-sample predictions, scored in the next cell.
y_pred_train = ruleset_est.predict(X_train)

[7]:

# Score the ruleset predictions against the training labels; the bare name on
# the last line renders the resulting one-row metrics table.
M_train = compute_metrics(y_pred_train, y_train)
M_train

[7]:

shape: (1, 16)

rule	TP	FP	TN	FN	precision	recall	accuracy	flagged(%)	good_flagged(%)	f0.25	f0.5	f1	f1.5	f2	num_rules
str	i64	i64	i64	i64	f64	f64	f64	f64	f64	f64	f64	f64	f64	f64	u32
"((X["Pclass"] < 3.0) & (X["Far…	193	92	423	117	0.677193	0.622581	0.746667	34.545455	17.864078	0.673717	0.665517	0.648739	0.638422	0.632787	2

[8]:

# Expression selected by the fitted RulesetClassifier.
best_ruleset = ruleset_est._best_ruleset_
# Report for the full ruleset, each rule in it, and each rule's conditions.
report = generate_rule_performance_report(best_ruleset, X_train, y_train)
report

[8]:

shape: (9, 17)

rule_index	rule	TP	FP	TN	FN	precision	recall	accuracy	flagged(%)	good_flagged(%)	f0.25	f0.5	f1	f1.5	f2	num_rules
str	str	i64	i64	i64	i64	f64	f64	f64	f64	f64	f64	f64	f64	f64	f64	u32
"0"	"((X["Pclass"] < 3.0) & (X["Far…	193	92	423	117	0.677193	0.622581	0.746667	34.545455	17.864078	0.673717	0.665517	0.648739	0.638422	0.632787	2
"0.0"	"(X['Pclass'] < 3.0) & (X['Fare…	174	87	447	152	0.666667	0.533742	0.722093	30.348837	16.292135	0.657041	0.635036	0.592845	0.568627	0.555911	1
"0.1"	"(X['Fare'] >= 11.1333) & (X['S…	43	6	509	267	0.877551	0.13871	0.669091	5.939394	1.165049	0.66819	0.424901	0.239554	0.187207	0.166796	1
"0.0.0"	"(X['Pclass'] < 3.0)"	223	177	372	119	0.5575	0.652047	0.667789	44.893378	32.240437	0.562296	0.57415	0.601078	0.619709	0.630656	1
"0.0.1"	"(X['Fare'] >= 13.7917)"	234	224	325	108	0.510917	0.684211	0.627385	51.402918	40.801457	0.518644	0.538178	0.585	0.619552	0.640745	1
"0.0.2"	"(X['Age'] < 61.0)"	285	407	17	5	0.41185	0.982759	0.422969	96.918768	95.990566	0.426421	0.465991	0.580448	0.688918	0.769438	1
"0.1.0"	"(X['Fare'] >= 11.1333)"	266	261	288	76	0.504744	0.777778	0.621773	59.147026	47.540984	0.515386	0.542857	0.612198	0.666795	0.701847	1
"0.1.1"	"(X['SibSp'] < 3.0)"	335	510	39	7	0.39645	0.979532	0.419753	94.837262	92.896175	0.410835	0.450027	0.564448	0.674357	0.756891	1
"0.1.2"	"(X['Age'] < 16.0)"	49	34	390	241	0.590361	0.168966	0.614846	11.62465	8.018868	0.514833	0.393891	0.262735	0.216519	0.197104	1

5. Generate Predictions on Test Data#

Apply the best ruleset to the test data and create a submission file:

[9]:

# PassengerId is kept on the test frame because the submission file needs it.
X_test = pl.read_csv("../../../../../kaggle/titanic/test.csv")
# Evaluate the ruleset expression with `X` bound to the test frame through
# eval's namespace. Rewriting the string with str.replace("X", "X_test") would
# also rewrite any column name or literal that contains a capital "X".
# A single merged globals dict is used so that `X` is also visible inside any
# nested scope of the expression.
# NOTE(review): eval executes arbitrary code; this is acceptable only because
# the expression comes from the fitted classifier, not from external input.
y_pred = eval(ruleset_est._best_ruleset_, {**globals(), "X": X_test})

[10]:

# Create submission file (Kaggle leaderboard score: 0.612)
# NOTE(review): if y_pred contains nulls (e.g. from missing feature values),
# they are written as empty cells - confirm this is what Kaggle should receive.
submission = pl.DataFrame({"PassengerId": X_test["PassengerId"], "Survived": y_pred})
# Cast the predictions to integers for the CSV.
submission = submission.with_columns(pl.col("Survived").cast(pl.Int64))
submission.write_csv("submission_titanic.csv")

[ ]: