e-Log

This is e271828's blog.

<kaggle> Code MEMO - ICR Competition

Overview

Competition: ICR - Identifying Age-Related Conditions (Featured Code Competition)

Tags: Tabular / Binary Classification / Health

Timeline: 2023/05/12 - 2023/08/11 (JST)

Evaluation: balanced Log Loss

Result: 206th / 6,430 (Solo Silver medal), Private Score: 0.40019, Submission Entries: 7

Code

Notebook Options: GPU not used / Internet off

Preparation

import libraries (for data handling and modeling)

# import libraries
import sys
sys.path.append("/kaggle/input/iterativestratification")

import gc
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from catboost import CatBoostClassifier, Pool
from sklearn.preprocessing import LabelEncoder
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

import warnings
warnings.simplefilter("ignore")

print("imported.")

read train/test/optional (greeks) data as pandas DataFrames

# read train-data
df_train = pd.read_csv("/kaggle/input/icr-identify-age-related-conditions/train.csv")
df_train = df_train.rename(columns={"BD ": "BD", "CD ": "CD", "CW ": "CW", "FD ": "FD"})
df_train
# read test-data
df_test = pd.read_csv("/kaggle/input/icr-identify-age-related-conditions/test.csv")
df_test = df_test.rename(columns={"BD ": "BD", "CD ": "CD", "CW ": "CW", "FD ": "FD"})
df_test
# read greeks-data
df_greeks = pd.read_csv("/kaggle/input/icr-identify-age-related-conditions/greeks.csv")
df_greeks = df_greeks.astype("category")
df_greeks
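
As a side note, a slightly more defensive sketch (my own variant, not in the original notebook) strips whitespace from every column name, covering the four padded columns without listing them by hand:

# strip stray whitespace from all column names ("BD ", "CD ", "CW ", "FD ")
df_train.columns = df_train.columns.str.strip()
df_test.columns = df_test.columns.str.strip()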

Data preprocessing

data handling with greeks (optional data)

# split Epsilon (Month/Day/Year)
epsilon = list(df_greeks["Epsilon"])
eps = []
for i in range(len(epsilon)):
    if epsilon[i] != "Unknown":
        eps.append(list(map(int, epsilon[i].split("/"))))
    else:
        eps.append([0, 0, 0])

df_epsilon = pd.DataFrame(eps, columns=["Month", "Day", "Year"])
df_greeks = pd.concat([df_greeks, df_epsilon], axis=1)
df_greeks
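
A minimal alternative sketch using pandas datetime parsing (assuming the Month/Day/Year layout handled above); "Unknown" is coerced to NaT and then filled with 0, matching the loop:

# parse Epsilon as a date; "Unknown" coerces to NaT
eps_dt = pd.to_datetime(df_greeks["Epsilon"].astype(str), format="%m/%d/%Y", errors="coerce")
df_epsilon_alt = pd.DataFrame({
    "Month": eps_dt.dt.month,
    "Day": eps_dt.dt.day,
    "Year": eps_dt.dt.year,
}).fillna(0).astype(int)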

encoding categorical features

# encode categorical features
le = LabelEncoder()
df_train["EJ"] = le.fit_transform(df_train["EJ"])
df_test["EJ"] = le.transform(df_test["EJ"])

print("[train]")
print(df_train["EJ"].value_counts())
print("[test]")
print(df_test["EJ"].value_counts())
# encode categorical features (greeks)
df_greeks["Alpha"] = le.fit_transform(df_greeks["Alpha"])
df_greeks["Beta"] = le.fit_transform(df_greeks["Beta"])
df_greeks["Gamma"] = le.fit_transform(df_greeks["Gamma"])
df_greeks["Delta"] = le.fit_transform(df_greeks["Delta"])
df_greeks["Epsilon"] = le.fit_transform(df_greeks["Epsilon"])

print(df_greeks["Alpha"].value_counts())
print(df_greeks["Beta"].value_counts())
print(df_greeks["Gamma"].value_counts())
print(df_greeks["Delta"].value_counts())
print(df_greeks["Epsilon"].value_counts())
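
One caveat: the single le above is refit on each call, so its stored mapping is only valid for the last column encoded. A small hypothetical variant keeps one encoder per column, in case the mappings are needed later:

# one encoder per greeks column, kept in a dict for later inspection
encoders = {col: LabelEncoder() for col in ["Alpha", "Beta", "Gamma", "Delta", "Epsilon"]}
for col, enc in encoders.items():
    df_greeks[col] = enc.fit_transform(df_greeks[col])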

shaping datasets for model-fitting

# set dataset for train
train_data = df_train.drop(columns=["Id", "BZ", "DV"])
cols = train_data.columns[:-1]
train_data
# set dataset for test
x_test = df_test.drop(columns=["Id", "BZ", "DV"])
id_test = df_test[["Id"]]
x_test

definition of the evaluation metric (balanced Log Loss)
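
For reference, my understanding of the competition metric is the average of the per-class mean log losses, where N_0 and N_1 are the class counts and p_{c,i} is the predicted probability of class c for sample i:

$$\text{balanced Log Loss} = -\frac{1}{2}\left(\frac{1}{N_0}\sum_{y_i=0}\log p_{0,i} + \frac{1}{N_1}\sum_{y_i=1}\log p_{1,i}\right)$$

The implementation below generalizes this with inverse-prevalence class weights w0 and w1; it reduces to the formula above whenever w0 = w1.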

# calculate Balanced LogLoss
def balancedLogLoss(y_true, y_pred):
    # clip predicted probabilities away from 0/1 to avoid log(0)
    y_pred = np.clip(y_pred, 1e-15, 1 - 1e-15)
    # per-class sample counts and inverse-prevalence weights
    N = np.bincount(y_true)
    w0, w1 = 1 / (N[0] / y_true.shape[0]), 1 / (N[1] / y_true.shape[0])
    
    # summed log-probability assigned to the true class, per class
    sum_zero = np.sum(np.where(y_true == 0, 1, 0) * np.log(y_pred[:, 0]))
    sum_one = np.sum(np.where(y_true != 0, 1, 0) * np.log(y_pred[:, 1]))
    BalancedLogLoss = -((w0 / N[0]) * sum_zero + (w1 / N[1]) * sum_one) / (w0 + w1)
    
    return BalancedLogLoss

print("defined.")

Fitting and Prediction

setting parameters

# random seeds
seeds = range(124)
#seeds = [3, 22, 45, 123]
#seeds += random.sample(range(4, 22), 10) + random.sample(range(23, 45), 11) + random.sample(range(46, 123), 20)
#seeds.sort()
#print("selected seeds:", seeds)
#print("")

# parameter
n_splits = 5
best_BLL = 100000
best_y_test_preds = []
best_oof = np.zeros((len(train_data), 2))
best_imp = pd.DataFrame()

fitting and prediction with CatBoostClassifier and MultilabelStratifiedKFold; the seed with the best out-of-fold balanced Log Loss is kept, and its per-fold test predictions are averaged at submission

# fitting by CatBoost with Multilabel Stratified K-Fold cross-validation (stratified on the greeks columns)
for seed in seeds:
    y_test_preds = []
    oof = np.zeros((len(train_data), 2))
    imp = pd.DataFrame()
    cv = list(MultilabelStratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed).split(train_data, df_greeks.iloc[:, 1:]))
    print("-"*20, "seed:", seed, "-"*20)
    
    params = {
        "loss_function": "MultiClass",
        "eval_metric": "MultiClass:use_weights=False",
        "n_estimators": 10000,
        "learning_rate": 0.005,
        "random_state": seed,
        "l2_leaf_reg": 1,
        "auto_class_weights": "Balanced",
        "use_best_model": True,
        "max_ctr_complexity": 15,
        "depth": 10,
        "grow_policy": "Lossguide",
        "max_leaves": 64,
        "min_data_in_leaf": 40,
    }
    
    for nfold in np.arange(n_splits):
        idx_tr, idx_va = cv[nfold][0], cv[nfold][1]
        x_tr, y_tr = train_data.loc[idx_tr, cols], train_data.loc[idx_tr, "Class"]
        x_va, y_va = train_data.loc[idx_va, cols], train_data.loc[idx_va, "Class"]
        train_pool = Pool(x_tr, y_tr)
        valid_pool = Pool(x_va, y_va)
        
        # fitting
        model = CatBoostClassifier(**params)
        model.fit(train_pool, eval_set=valid_pool,
                  verbose=False,
                  early_stopping_rounds=1000,
                  use_best_model=True
                  )
        
        # prediction
        y_tr_pred = model.predict_proba(x_tr)
        y_va_pred = model.predict_proba(x_va)
        oof[idx_va, :] = y_va_pred
        y_test_preds.append(model.predict_proba(x_test))
        print("Balanced LogLoss", nfold, ":", "{:.5f}".format(balancedLogLoss(y_va, y_va_pred)))
        
        # importance of features
        _imp = pd.DataFrame({"features": cols, "importance": model.feature_importances_, "nfold": nfold})
        imp = pd.concat([imp, _imp], axis=0, ignore_index=True)
        
        del idx_tr, idx_va, x_tr, x_va, y_tr, y_va, train_pool, valid_pool, model, y_tr_pred, y_va_pred
        gc.collect()
    
    # Balanced LogLoss
    BLL = balancedLogLoss(train_data["Class"], oof)
    if BLL < best_BLL:
        best_BLL = BLL
        best_y_test_preds = y_test_preds
        best_oof = oof
        best_imp = imp
    
    print("Best Balanced LogLoss(Temporary):", "{:.5f}".format(best_BLL))
    print("")
    del BLL, y_test_preds, oof, imp
    gc.collect()

print("-"*20, "result", "-"*20)
print("Best Balanced LogLoss:", "{:.5f}".format(balancedLogLoss(train_data["Class"], best_oof)))
print("")

print("-"*14, "feature importance", "-"*14)
print("")
best_imp = best_imp.groupby("features")["importance"].agg(["mean", "std"])
best_imp.columns = ["importance", "importance_std"]
best_imp["importance_cov"] = best_imp["importance_std"] / best_imp["importance"]
best_imp = best_imp.reset_index(drop=False)
display(best_imp.sort_values("importance", ascending=False, ignore_index=True))
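
Since matplotlib and seaborn are imported but otherwise unused, here is an optional plotting sketch for the aggregated importances (it assumes only the best_imp frame built above):

# plot the 20 most important features (mean importance over folds, best seed)
top20 = best_imp.sort_values("importance", ascending=False).head(20)
plt.figure(figsize=(8, 6))
sns.barplot(data=top20, x="importance", y="features", color="steelblue")
plt.title("CatBoost feature importance (best seed)")
plt.tight_layout()
plt.show()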

Submission

averaging the per-fold prediction results from the best seed and setting them in a pandas DataFrame

# set dataset for submission
sample_sub = pd.read_csv("/kaggle/input/icr-identify-age-related-conditions/sample_submission.csv")
df_submit = pd.DataFrame(columns=sample_sub.columns)
df_submit["Id"] = id_test["Id"]
df_submit[["class_0", "class_1"]] = np.mean(best_y_test_preds, axis=0)
df_submit
# submission
df_submit.to_csv("submission.csv", index=None)
print("completed.")

Best regards, e271828