Overview
Competition: ICR - Identifying Age-Related Conditions (Featured Code Competition)
Tags: Tabular / Binary Classification / Health
Timeline: 2023/05/12 - 2023/08/11 (JST)
Evaluation: Balanced Log Loss
Result: 206th / 6,430 (Solo Silver medal), Private Score: 0.40019, Submission Entries: 7
Code
Notebook Option: GPU is not used / Internet off
Preparation
import libraries (for data handling and modeling)
# import libraries
import sys
sys.path.append("/kaggle/input/iterativestratification")  # offline copy of iterative-stratification (internet is off)

import gc
import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from catboost import CatBoostClassifier, Pool
from sklearn.preprocessing import LabelEncoder
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

import warnings
warnings.simplefilter("ignore")

print("imported.")
read train/test/supplemental (greeks) data as pandas DataFrames
# read train-data
df_train = pd.read_csv("/kaggle/input/icr-identify-age-related-conditions/train.csv")
df_train = df_train.rename(columns={"BD ": "BD", "CD ": "CD", "CW ": "CW", "FD ": "FD"})
df_train
# read test-data
df_test = pd.read_csv("/kaggle/input/icr-identify-age-related-conditions/test.csv")
df_test = df_test.rename(columns={"BD ": "BD", "CD ": "CD", "CW ": "CW", "FD ": "FD"})
df_test
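As an aside, the same cleanup can be written generically. A minimal sketch that strips whitespace from every column name, assuming only the four renamed columns carry stray trailing spaces (as the renames above suggest):

# equivalent sketch: strip stray whitespace from all column names at once
df_train.columns = df_train.columns.str.strip()
df_test.columns = df_test.columns.str.strip()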
# read greeks-data
df_greeks = pd.read_csv("/kaggle/input/icr-identify-age-related-conditions/greeks.csv")
df_greeks = df_greeks.astype("category")
df_greeks
Data preprocessing
data handling with greeks (supplemental data)
# split Epsilon (Month/Day/Year)
epsilon = list(df_greeks["Epsilon"])
eps = []
for i in range(len(epsilon)):
    if epsilon[i] != "Unknown":
        eps.append(list(map(int, epsilon[i].split("/"))))
    else:
        eps.append([0, 0, 0])
df_epsilon = pd.DataFrame(eps, columns=["Month", "Day", "Year"])
df_greeks = pd.concat([df_greeks, df_epsilon], axis=1)
df_greeks
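For reference, pandas can also parse Epsilon directly. A minimal alternative sketch, assuming the strings follow a Month/Day/Year pattern with four-digit years; "Unknown" coerces to NaT, which is mapped to 0 to match the loop above:

# alternative sketch: parse Epsilon with pandas (assumes "%m/%d/%Y"; "Unknown" -> NaT -> 0)
dates = pd.to_datetime(df_greeks["Epsilon"].astype(str), format="%m/%d/%Y", errors="coerce")
df_epsilon_alt = pd.DataFrame({
    "Month": dates.dt.month.fillna(0).astype(int),
    "Day": dates.dt.day.fillna(0).astype(int),
    "Year": dates.dt.year.fillna(0).astype(int),
})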
encoding categorical features
# encode categorical features
le = LabelEncoder()
df_train["EJ"] = le.fit_transform(df_train["EJ"])
df_test["EJ"] = le.transform(df_test["EJ"])
print("[train]")
print(df_train["EJ"].value_counts())
print("[test]")
print(df_test["EJ"].value_counts())
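For intuition, LabelEncoder maps the sorted distinct values to consecutive integers. A tiny illustration with the assumed "A"/"B" values of EJ:

# illustration: LabelEncoder assigns 0/1 by sorted order of the classes
le_demo = LabelEncoder()
print(le_demo.fit_transform(["A", "B", "A"]))  # -> [0 1 0]
print(le_demo.classes_)                        # -> ['A' 'B']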
# encode categorical features (Greeks)
df_greeks["Alpha"] = le.fit_transform(df_greeks["Alpha"])
df_greeks["Beta"] = le.fit_transform(df_greeks["Beta"])
df_greeks["Gamma"] = le.fit_transform(df_greeks["Gamma"])
df_greeks["Delta"] = le.fit_transform(df_greeks["Delta"])
df_greeks["Epsilon"] = le.fit_transform(df_greeks["Epsilon"])
print(df_greeks["Alpha"].value_counts())
print(df_greeks["Beta"].value_counts())
print(df_greeks["Gamma"].value_counts())
print(df_greeks["Delta"].value_counts())
print(df_greeks["Epsilon"].value_counts())
shaping datasets for model-fitting
# set dataset for train
train_data = df_train.drop(columns=["Id", "BZ", "DV"])
cols = train_data.columns[:-1]  # feature columns (all but the target "Class")
train_data
# set dataset for test
x_test = df_test.drop(columns=["Id", "BZ", "DV"])
id_test = df_test[["Id"]]
x_test
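Since the same columns are dropped from both frames, a quick optional check (not in the original cells) that the test features line up with the training feature columns used for fitting below:

# optional sanity check: feature columns should match between train and test
assert list(x_test.columns) == list(cols), "train/test feature mismatch"
print("columns aligned:", len(cols), "features")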
definition of the evaluation metric (Balanced Log Loss)
# calculate Balanced LogLoss
def balancedLogLoss(y_true, y_pred):
    # clip probabilities away from 0/1 to keep the log finite
    y_pred = np.clip(y_pred, 1e-15, 1 - 1e-15)
    N = np.bincount(y_true)  # class counts N0, N1
    # inverse class frequencies as class weights
    w0, w1 = 1 / (N[0] / y_true.shape[0]), 1 / (N[1] / y_true.shape[0])
    sum_zero = np.sum(np.where(y_true == 0, 1, 0) * np.log(y_pred[:, 0]))
    sum_one = np.sum(np.where(y_true != 0, 1, 0) * np.log(y_pred[:, 1]))
    BalancedLogLoss = -((w0 / N[0]) * sum_zero + (w1 / N[1]) * sum_one) / (w0 + w1)
    return BalancedLogLoss

print("defined.")
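Written out, the function computes the following, where $N$ is the number of samples, $N_0$ and $N_1$ the class counts, and $p_{i,c}$ the clipped predicted probability of class $c$ for sample $i$:

\[
w_0 = \frac{N}{N_0}, \qquad w_1 = \frac{N}{N_1}
\]
\[
\mathrm{BLL} = -\,\frac{\dfrac{w_0}{N_0} \sum_{i:\, y_i = 0} \log p_{i,0} \;+\; \dfrac{w_1}{N_1} \sum_{i:\, y_i = 1} \log p_{i,1}}{w_0 + w_1}
\]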
Fitting and Prediction
setting parameters
# random seeds
seeds = range(124)
#seeds = [3, 22, 45, 123]
#seeds += random.sample(range(4, 22), 10) + random.sample(range(23, 45), 11) + random.sample(range(46, 123), 20)
#seeds.sort()
#print("selected seeds:", seeds)
#print("")

# parameter
n_splits = 5
best_BLL = 100000
best_y_test_preds = []
best_oof = np.zeros((len(train_data), 2))
best_imp = pd.DataFrame()
fitting and prediction using CatBoostClassifier with MultilabelStratifiedKFold cross-validation, searching over random seeds and keeping the best seed's predictions (selected by out-of-fold Balanced Log Loss)
# fitting by CatBoost with MultilabelStratifiedKFold cross-validation
for seed in seeds:
    y_test_preds = []
    oof = np.zeros((len(train_data), 2))
    imp = pd.DataFrame()
    # stratify on every greeks column except Id (Alpha..Epsilon plus the Month/Day/Year split)
    cv = list(MultilabelStratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
              .split(train_data, df_greeks.iloc[:, 1:]))
    print("-"*20, "seed:", seed, "-"*20)
    params = {
        "loss_function": "MultiClass",
        "eval_metric": "MultiClass:use_weights=False",
        "n_estimators": 10000,
        "learning_rate": 0.005,
        "random_state": seed,
        "l2_leaf_reg": 1,
        "auto_class_weights": "Balanced",
        "use_best_model": True,
        "max_ctr_complexity": 15,
        "depth": 10,
        "grow_policy": "Lossguide",
        "max_leaves": 64,
        "min_data_in_leaf": 40,
    }
    for nfold in np.arange(n_splits):
        idx_tr, idx_va = cv[nfold][0], cv[nfold][1]
        x_tr, y_tr = train_data.loc[idx_tr, cols], train_data.loc[idx_tr, "Class"]
        x_va, y_va = train_data.loc[idx_va, cols], train_data.loc[idx_va, "Class"]
        train_pool = Pool(x_tr, y_tr)
        valid_pool = Pool(x_va, y_va)

        # fitting
        model = CatBoostClassifier(**params)
        model.fit(train_pool,
                  eval_set=valid_pool,
                  verbose=False,
                  early_stopping_rounds=1000,
                  use_best_model=True)

        # prediction
        y_tr_pred = model.predict_proba(x_tr)
        y_va_pred = model.predict_proba(x_va)
        oof[idx_va, :] = y_va_pred
        y_test_preds.append(model.predict_proba(x_test))
        print("Balanced LogLoss", nfold, ":", "{:.5f}".format(balancedLogLoss(y_va, y_va_pred)))

        # importance of features
        _imp = pd.DataFrame({"features": cols, "importance": model.feature_importances_, "nfold": nfold})
        imp = pd.concat([imp, _imp], axis=0, ignore_index=True)

        del idx_tr, idx_va, x_tr, x_va, y_tr, y_va, train_pool, valid_pool, model, y_tr_pred, y_va_pred
        gc.collect()

    # Balanced LogLoss (keep the seed with the best out-of-fold score)
    BLL = balancedLogLoss(train_data["Class"], oof)
    if BLL < best_BLL:
        best_BLL = BLL
        best_y_test_preds = y_test_preds
        best_oof = oof
        best_imp = imp
    print("Best Balanced LogLoss(Temporary):", "{:.5f}".format(best_BLL))
    print("")
    del BLL, y_test_preds, oof, imp
    gc.collect()

print("-"*20, "result", "-"*20)
print("Best Balanced LogLoss:", "{:.5f}".format(balancedLogLoss(train_data["Class"], best_oof)))
print("")
print("-"*14, "feature importance", "-"*14)
print("")
best_imp = best_imp.groupby("features")["importance"].agg(["mean", "std"])
best_imp.columns = ["importance", "importance_std"]
best_imp["importance_cov"] = best_imp["importance_std"] / best_imp["importance"]
best_imp = best_imp.reset_index(drop=False)
display(best_imp.sort_values("importance", ascending=False, ignore_index=True))
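Note that this loop keeps only the single best seed's fold predictions. A plain seed-averaging variant, which is not what this notebook submits, would instead pool the test predictions from every seed. A hypothetical sketch of that aggregation, assuming per-seed fold-averaged predictions had been stashed in a dict preds_by_seed (a name not used above, with dummy values):

# hypothetical sketch: average across all seeds instead of picking the best one
preds_by_seed = {
    0: np.array([[0.70, 0.30], [0.20, 0.80]]),  # dummy (n_test, 2) fold-averages
    1: np.array([[0.64, 0.36], [0.26, 0.74]]),
}
y_test_pred_avg = np.mean(list(preds_by_seed.values()), axis=0)
print(y_test_pred_avg)  # -> [[0.67 0.33] [0.23 0.77]]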
Submission
Averaging the fold prediction results and arranging them in a pandas DataFrame for submission
# set dataset for submission
sample_sub = pd.read_csv("/kaggle/input/icr-identify-age-related-conditions/sample_submission.csv")
df_submit = pd.DataFrame(columns=sample_sub.columns)
df_submit["Id"] = id_test["Id"]
df_submit[["class_0", "class_1"]] = np.mean(best_y_test_preds, axis=0)
df_submit
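Since each fold's predict_proba rows sum to 1, their average does too. A quick optional check (not in the original cells) before writing the file:

# optional check: averaged probabilities should still sum to 1 per row
assert np.allclose(df_submit[["class_0", "class_1"]].sum(axis=1), 1.0)
print("row sums OK.")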
# submission
df_submit.to_csv("submission.csv", index=None)
print("completed.")
Best regards, e271828