Overview
Competition:
ICR - Identifying Age-Related Conditions
(Featured Code Competition)
Tags:
Tabular / Binary Classification / Health
Timeline:
2023/05/12 - 2023/08/11 (JST)
Evaluation:
balanced Log Loss
Result:
206th / 6,430 (Solo Silver medal), Private Score: 0.40019, Submission Entries: 7
Code
Notebook Option: GPU is not used / Internet off
Preparation
import libraries (to use data/model handling)
# import libraries
import sys
sys.path.append("/kaggle/input/iterativestratification")
import gc
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from catboost import CatBoostClassifier, Pool
from sklearn.preprocessing import LabelEncoder
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
import warnings
warnings.simplefilter("ignore")
print("imported.")
read train/test/option data as pandas dataframe
# read train-data; several raw CSV column names carry a trailing space
# ("BD " etc.) — normalize them so downstream code can use clean names
df_train = pd.read_csv("/kaggle/input/icr-identify-age-related-conditions/train.csv")
df_train = df_train.rename(columns={"BD ": "BD", "CD ": "CD", "CW ": "CW", "FD ": "FD"})
df_train
# read test-data; apply the same trailing-space column cleanup as for train
df_test = pd.read_csv("/kaggle/input/icr-identify-age-related-conditions/test.csv")
df_test = df_test.rename(columns={"BD ": "BD", "CD ": "CD", "CW ": "CW", "FD ": "FD"})
df_test
# read greeks-data (per-sample meta-targets, train rows only); cast all
# columns to pandas "category" dtype for the encoding/stratification below
df_greeks = pd.read_csv("/kaggle/input/icr-identify-age-related-conditions/greeks.csv")
df_greeks = df_greeks.astype("category")
df_greeks
Data preprocessing
data handling with greeks(optional data)
# split the Epsilon date strings ("Month/Day/Year") into three integer
# columns; the sentinel value "Unknown" becomes (0, 0, 0)
eps = [
    [int(part) for part in value.split("/")] if value != "Unknown" else [0, 0, 0]
    for value in df_greeks["Epsilon"]
]
df_epsilon = pd.DataFrame(eps, columns=["Month", "Day", "Year"])
df_greeks = pd.concat([df_greeks, df_epsilon], axis=1)
df_greeks
encoding categorical features
# label-encode the lone categorical feature EJ: fit on train, apply the
# same mapping to test, then show the class counts for both splits
le = LabelEncoder()
df_train["EJ"] = le.fit_transform(df_train["EJ"])
df_test["EJ"] = le.transform(df_test["EJ"])
for tag, frame in (("train", df_train), ("test", df_test)):
    print(f"[{tag}]")
    print(frame["EJ"].value_counts())
# encode categorical features (Greeks): label-encode each meta-target
# column. The shared LabelEncoder instance is re-fit per column —
# fit_transform discards the previous fit, so reusing one object is safe.
greek_cols = ["Alpha", "Beta", "Gamma", "Delta", "Epsilon"]
for col in greek_cols:
    df_greeks[col] = le.fit_transform(df_greeks[col])
# report the encoded value counts in the original column order
for col in greek_cols:
    print(df_greeks[col].value_counts())
shaping datasets for model-fitting
# set dataset for train: drop the row identifier plus columns BZ and DV
# (presumably removed by prior feature selection — TODO confirm with author)
train_data = df_train.drop(columns=["Id", "BZ", "DV"])
cols = train_data.columns[:-1]  # feature columns; the last column is the "Class" target
train_data
# set dataset for test: same feature drop; Id kept aside for the submission file
x_test = df_test.drop(columns=["Id", "BZ", "DV"])
id_test = df_test[["Id"]]
x_test
definition of Evaluation index(balanced Log Loss)
# evaluation metric: class-weighted log loss with weights inversely
# proportional to class frequency (weighted variant used as the CV metric)
def balancedLogLoss(y_true, y_pred):
    """Return the balanced log loss of ``y_pred`` against ``y_true``.

    y_true: array-like of 0/1 labels; y_pred: (n, 2) array of class
    probabilities, column 0 for class 0 and column 1 for class 1.
    """
    labels = np.asarray(y_true)
    # clip so the logarithm never sees 0 or 1 exactly
    probs = np.clip(y_pred, 1e-15, 1 - 1e-15)
    counts = np.bincount(labels)
    total = labels.shape[0]
    # per-class weights: inverse of the class frequency
    weight_zero = total / counts[0]
    weight_one = total / counts[1]
    is_one = (labels != 0).astype(int)
    sum_zero = np.sum((1 - is_one) * np.log(probs[:, 0]))
    sum_one = np.sum(is_one * np.log(probs[:, 1]))
    numerator = (weight_zero / counts[0]) * sum_zero + (weight_one / counts[1]) * sum_one
    return -numerator / (weight_zero + weight_one)
print("defined.")
Fitting and Prediction
setting parameters
# seeds used for seed averaging: every integer in [0, 124)
seeds = range(124)

# cross-validation configuration and trackers for the best seed so far
n_splits = 5
best_BLL = 100000                              # lowest OOF balanced log loss seen
best_y_test_preds = []                         # per-fold test predictions of the best seed
best_oof = np.zeros((len(train_data), 2))      # OOF probabilities of the best seed
best_imp = pd.DataFrame()                      # feature importances of the best seed
fitting and prediction using CatBoostClassifier with MultilabelStratifiedKFold and seed-averaging
# fitting by CatBoost with Multilabel Stratified K-Fold cross-validation,
# repeated over many seeds; the seed whose out-of-fold (OOF) balanced log
# loss is lowest wins, and its predictions/importances are kept.
for seed in seeds:
    y_test_preds = []                     # per-fold test-set predictions
    oof = np.zeros((len(train_data), 2))  # OOF class probabilities for this seed
    imp = pd.DataFrame()                  # per-fold feature importances
    # folds are stratified on all Greeks meta-target columns (1:), not just Class
    cv = list(MultilabelStratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed).split(train_data, df_greeks.iloc[:, 1:]))
    print("-"*20, "seed:", seed, "-"*20)
    params = {
        "loss_function": "MultiClass",
        "eval_metric": "MultiClass:use_weights=False",
        "n_estimators": 10000,
        "learning_rate": 0.005,
        "random_state": seed,
        "l2_leaf_reg": 1,
        "auto_class_weights": "Balanced",  # compensates the class imbalance
        "use_best_model": True,
        "max_ctr_complexity": 15,
        "depth": 10,
        "grow_policy": "Lossguide",
        "max_leaves": 64,
        "min_data_in_leaf": 40,
    }
    for nfold in np.arange(n_splits):
        idx_tr, idx_va = cv[nfold][0], cv[nfold][1]
        x_tr, y_tr = train_data.loc[idx_tr, cols], train_data.loc[idx_tr, "Class"]
        x_va, y_va = train_data.loc[idx_va, cols], train_data.loc[idx_va, "Class"]
        train_pool = Pool(x_tr, y_tr)
        valid_pool = Pool(x_va, y_va)
        # fitting: early stopping on the validation fold, best iteration kept
        model = CatBoostClassifier(**params)
        model.fit(train_pool, eval_set=valid_pool,
                  verbose=False,
                  early_stopping_rounds=1000,
                  use_best_model=True
                  )
        # prediction
        y_tr_pred = model.predict_proba(x_tr)
        y_va_pred = model.predict_proba(x_va)
        oof[idx_va, :] = y_va_pred
        y_test_preds.append(model.predict_proba(x_test))
        print("Balanced LogLoss", nfold, ":", "{:.5f}".format(balancedLogLoss(y_va, y_va_pred)))
        # importance of features for this fold
        _imp = pd.DataFrame({"features": cols, "importance": model.feature_importances_, "nfold": nfold})
        imp = pd.concat([imp, _imp], axis=0, ignore_index=True)
        # free fold-local objects before the next fold (124 seeds x 5 folds)
        del idx_tr, idx_va, x_tr, x_va, y_tr, y_va, train_pool, valid_pool, model, y_tr_pred, y_va_pred
        gc.collect()
    # Balanced LogLoss of the full-train OOF predictions for this seed
    BLL = balancedLogLoss(train_data["Class"], oof)
    if BLL < best_BLL:
        best_BLL = BLL
        best_y_test_preds = y_test_preds
        best_oof = oof
        best_imp = imp
    print("Best Balanced LogLoss(Temporary):", "{:.5f}".format(best_BLL))
    print("")
    del BLL, y_test_preds, oof, imp
    gc.collect()
# final report: OOF score of the winning seed, then fold-averaged importances
print("-"*20, "result", "-"*20)
print("Best Balanced LogLoss:", "{:.5f}".format(balancedLogLoss(train_data["Class"], best_oof)))
print("")
print("-"*14, "feature importance", "-"*14)
print("")
best_imp = best_imp.groupby("features")["importance"].agg(["mean", "std"])
best_imp.columns = ["importance", "importance_std"]
# coefficient of variation of each feature's importance across folds (std / mean)
best_imp["importance_cov"] = best_imp["importance_std"] / best_imp["importance"]
best_imp = best_imp.reset_index(drop=False)
display(best_imp.sort_values("importance", ascending=False, ignore_index=True))
Submission
Taking the average of the prediction results and setting them as pandas dataframe
# set dataset for submission: reuse the sample file's column order
sample_sub = pd.read_csv("/kaggle/input/icr-identify-age-related-conditions/sample_submission.csv")
df_submit = pd.DataFrame(columns=sample_sub.columns)
df_submit["Id"] = id_test["Id"]
# average the per-fold test predictions of the best seed (simple mean over folds)
df_submit[["class_0", "class_1"]] = np.mean(best_y_test_preds, axis=0)
df_submit
# submission
df_submit.to_csv("submission.csv", index=None)
print("completed.")
Best regards, e271828