Python - Data Analysis

With the data cleaned and the relationships between variables explored, I now build predictive models of patient copayments and insurance claim approvals.

These models are primarily built with the scikit-learn library.

Import the Data and Clean

I start my analysis by importing the data and performing some basic cleaning.

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
import matplotlib.pyplot as plt
import seaborn as sns

pharmacy_data_raw = pd.read_csv("pharmacy_tx.csv")

# recode variables
pharmacy_data = \
    pharmacy_data_raw \
    .assign(
        claim_status = pd.Categorical(pharmacy_data_raw.rejected),
        diagnosis = pd.Categorical(pharmacy_data_raw.diagnosis),
        drug = pd.Categorical(pharmacy_data_raw.drug),
        bin = pd.Categorical(pharmacy_data_raw.bin),
        pcn = pd.Categorical(pharmacy_data_raw.pcn),
        group = pd.Categorical(pharmacy_data_raw.group)
    )

pharmacy_data = \
    pharmacy_data \
    .assign(
        claim_status = lambda x: x.claim_status.cat.rename_categories({0: "Approved", 1: "Rejected"})
    )

# create month variable
pharmacy_data = \
    pharmacy_data \
    .assign(
        tx_month = pd.DatetimeIndex(pharmacy_data["tx_date"]).month
    )

# drop rejected claims
pharmacy_data = pharmacy_data[pharmacy_data.claim_status == "Approved"]

# log-transform the approved copayments (using the filtered data, not the raw
# data, so rejected claims' zero-dollar copays don't enter the log)
pharmacy_data = \
    pharmacy_data \
    .assign(log_patient_pay = np.log(pharmacy_data.patient_pay))
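
One caveat with the log transform: if any claims had a copayment of exactly zero, np.log would produce -inf (numpy emits a divide-by-zero warning). A defensive alternative, sketched below under that assumption, is np.log1p, which maps 0 to 0; predictions would then be back-transformed with np.expm1 rather than np.exp.

# hypothetical guard against zero-dollar copayments
pharmacy_data = \
    pharmacy_data \
    .assign(log_patient_pay = np.log1p(pharmacy_data.patient_pay))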


# the data set is too large for my local machine to run models on!
pharmacy_data = pharmacy_data.sample(n = 10000, random_state = 1)

Create Train Test Split

Next, I split the data into training and testing partitions.

# train-test split
pharmacy_train, pharmacy_test = train_test_split(
    pharmacy_data,
    shuffle = True,
    random_state = 1,
    test_size = 0.2
)

Objective I: Predict Patient Copayments

Model Building

Model Specifications

With the data prepped, I now create the model specifications complete with several preprocessing steps. These steps include:

  • Creating dummy variables for each categorical variable
  • Grouping infrequently occurring categories (such as rare diagnoses) into a catch-all “infrequent” category via the encoder’s min_frequency argument
import sklearn.preprocessing as pre
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
nominal_features = ["diagnosis", "drug", "bin", "pcn", "group", "tx_month"]

nominal_pipeline = \
    Pipeline([
        ("onehot", pre.OneHotEncoder(handle_unknown = "ignore", min_frequency = 20))
    ])

preprocessing_pipeline = \
    ColumnTransformer([
        ("nominal_preprocessor", nominal_pipeline, nominal_features)
    ])
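
As a quick illustration of that second step (a toy sketch, separate from the pipeline above), OneHotEncoder's min_frequency option pools every category appearing fewer than the threshold number of times into a single "infrequent" indicator column:

import pandas as pd
import sklearn.preprocessing as pre

# toy data: "rare1" and "rare2" each appear fewer than 3 times
toy = pd.DataFrame({"diagnosis": ["A"] * 5 + ["B"] * 4 + ["rare1", "rare2"]})

enc = pre.OneHotEncoder(handle_unknown = "ignore", min_frequency = 3)
enc.fit(toy)

# the rare levels collapse into one shared column
print(enc.get_feature_names_out())
# ['diagnosis_A' 'diagnosis_B' 'diagnosis_infrequent_sklearn']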

Models

To predict patient copayments, I will use the following models:

  • Regularized linear regression
  • K-nearest neighbors
  • Boosted tree
  • Random forest
  • Support vector machine
from sklearn.linear_model import Lasso
from sklearn.neighbors import KNeighborsRegressor
import xgboost
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
# linear regression
lasso_pipe = Pipeline([
    ("preprocessor", preprocessing_pipeline),
    ("lasso", Lasso(random_state = 2))
])

# k-nearest neighbors
knn_pipe = Pipeline([
    ("preprocessor", preprocessing_pipeline),
    ("knn", KNeighborsRegressor())
])

# xgboost
xgb_pipe = Pipeline([
    ("preprocessor", preprocessing_pipeline),
    ("xgb", xgboost.XGBRegressor())
])

# random forest
rf_pipe = Pipeline([
    ("preprocessor", preprocessing_pipeline),
    ("rf", RandomForestRegressor(n_estimators = 1000))
])

# support vector machine
svm_pipe = Pipeline([
    ("preprocessor", preprocessing_pipeline),
    ("svm", SVR())
])

Model Tuning and Fitting

Define Parameter Grids

This section defines the parameter grids for model tuning.

# set grid search parameters
alphas = [0.001, 0.01, 0.1, 0.25, 0.5, 0.75, 1, 5, 10, 25, 50, 100, 500]
param_range = [1, 5, 10, 15, 20, 25]
param_range_fl = [1.0, 0.75, 0.50, 0.25, 0.10]
n_estimators = [250, 750, 1250, 1750, 2000]
learning_rates = [0.1, 0.2, 0.3, 0.4, 0.5]

# linear model parameter grid
lasso_param_grid = [
    {"lasso__alpha": alphas}
]

# knn parameter grid
knn_param_grid = [
    {"knn__n_neighbors": param_range,
    "knn__weights": ["uniform", "distance"],
    "knn__metric": ["euclidean", "manhattan"]}
]

# xgboost parameter grid
xgb_param_grid = [
    {"xgb__learning_rate": learning_rates,
    "xgb__max_depth": param_range,
    "xgb__min_child_weight": param_range[:2],
    "xgb__subsample": param_range_fl,
    "xgb__n_estimators": n_estimators}
]

# random forest parameter grid
rf_param_grid = {
    "rf__min_samples_leaf": param_range,
    "rf__min_samples_split": param_range[1:]
}

# support vector machine parameter grid
svm_param_grid = [
    {"svm__kernel": ["linear", "rbf"],
    "svm__C": param_range}
]
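
Before running the searches, it is worth gauging their size. A quick count with sklearn's ParameterGrid (shown only for illustration) makes clear why I test on a small sample first: the XGBoost grid alone contains 1,500 candidate settings, or 7,500 fits under 5-fold cross-validation.

from sklearn.model_selection import ParameterGrid

# number of candidate parameter combinations in each grid
for name, grid in [("lasso", lasso_param_grid), ("knn", knn_param_grid),
                   ("xgb", xgb_param_grid), ("rf", rf_param_grid),
                   ("svm", svm_param_grid)]:
    print(name, len(ParameterGrid(grid)))
# lasso 13, knn 24, xgb 1500, rf 30, svm 12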

Search Grids

Here we set up the parameter search grids.

lasso_grid_search = \
    GridSearchCV(
        estimator = lasso_pipe,
        param_grid = lasso_param_grid,
        scoring = "neg_root_mean_squared_error",
        cv = 5,
        n_jobs = 4
    )

knn_grid_search = \
    GridSearchCV(
        estimator = knn_pipe,
        param_grid = knn_param_grid,
        scoring = "neg_root_mean_squared_error",
        cv = 5,
        n_jobs = 4
    )

xgboost_grid_search = \
    GridSearchCV(
        estimator = xgb_pipe,
        param_grid = xgb_param_grid,
        scoring = "neg_root_mean_squared_error",
        cv = 5,
        n_jobs = 4
    )

rf_grid_search = \
    GridSearchCV(
        estimator = rf_pipe,
        param_grid = rf_param_grid,
        scoring = "neg_root_mean_squared_error",
        cv = 5,
        n_jobs = 4
    )

svm_grid_search = \
    GridSearchCV(
        estimator = svm_pipe,
        param_grid = svm_param_grid,
        scoring = "neg_root_mean_squared_error",
        cv = 5,
        n_jobs = 4
    )

Model Testing

Before fitting the models to the full training set, I first estimate them on a much smaller random sample from the training set to identify any model specification errors. This way I can more efficiently ensure that the models will run on the full data without issue. Each model is then tuned with an exhaustive grid search over its parameter grid, using 5-fold cross-validation.

# test models on a smaller sample to identify errors;
# random_state makes the draw reproducible
model_test_data = pharmacy_train.sample(1000, random_state = 2)

lasso_grid_search.fit(
    X = model_test_data[nominal_features],
    y = model_test_data.log_patient_pay
)
knn_grid_search.fit(
    X = model_test_data[nominal_features],
    y = model_test_data.log_patient_pay
)
xgboost_grid_search.fit(
    X = model_test_data[nominal_features],
    y = model_test_data.log_patient_pay
)
rf_grid_search.fit(
    X = model_test_data[nominal_features],
    y = model_test_data.log_patient_pay
)
svm_grid_search.fit(
    X = model_test_data[nominal_features],
    y = model_test_data.log_patient_pay
)

Full Model Fitting

Now I fit and tune the models on the full training set, with GridSearchCV handling the cross-validation folds.

grids = [lasso_grid_search, knn_grid_search, xgboost_grid_search, rf_grid_search, svm_grid_search]

for grid in grids:
    grid.fit(
        X = pharmacy_train[nominal_features], 
        y = pharmacy_train.log_patient_pay
    )

import pickle
pickle.dump(grids, open("sklearn_reg_models.sav", 'wb'))

Model Evaluation

With the models fit, we now explore the best results of each model to compare their performance.

models = pickle.load(open("sklearn_reg_models.sav", 'rb'))
results = []

for model in models:
    results.append(np.abs(model.best_score_))

rmse_scores = pd.DataFrame(
    results, 
    columns = ["rmse"],
    index = ["Lasso", "KNN", "XGBoost", "Random Forest", "SVM"]
).sort_values("rmse")

rmse_scores

                   rmse
XGBoost        0.148462
Random Forest  0.169219
SVM            0.187782
KNN            0.439062
Lasso          0.812565

The boosted tree had the lowest RMSE, closely followed by the random forest. We can also visually compare the best model specifications:

sns.scatterplot(
    y = "rmse",
    x = rmse_scores.index,
    data = rmse_scores
)

[Figure: best cross-validated RMSE by model]

Model Finalization

Let’s examine the chosen parameters of the best performing model.

models[2].best_params_
{'xgb__learning_rate': 0.3,
 'xgb__max_depth': 6,
 'xgb__min_child_weight': 1,
 'xgb__n_estimators': 1750,
 'xgb__subsample': 1.0}
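
Rather than retyping these values, the tuned pipeline could also be pulled directly from the search object, since GridSearchCV refits the best candidate on the full training data by default (refit = True):

# equivalent shortcut to the manual specification below
final_pipe = models[2].best_estimator_

I spell the parameters out below for transparency.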

Now, we can use the tuned parameter values to finalize our model and fit the model to the full training set.

final_pipe = Pipeline([
    ("preprocessor", preprocessing_pipeline),
    ("xgb", xgboost.XGBRegressor(
        n_estimators = 1750,
        max_depth = 6, 
        min_child_weight = 1,
        subsample = 1.0,
        learning_rate = 0.3
    ))
])

final_pipe.fit(
    X = pharmacy_train[nominal_features], 
    y = pharmacy_train.log_patient_pay
)

pickle.dump(final_pipe, open("final_reg_fit.sav", 'wb'))

Finalized Predictions

# note: for regressors, Pipeline.score returns R-squared, not the RMSE
np.exp(final_pipe.score(pharmacy_test[nominal_features], pharmacy_test.log_patient_pay))
2.652199673745764

Since Pipeline.score returns R-squared for regressors, this value is exp(R-squared) ≈ 2.65, which corresponds to a test-set R-squared of about 0.98: the model explains nearly all of the variation in log copayments.
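
For an error measure in dollars, a more direct approach (a short sketch using the fitted pipeline) is to back-transform the predictions and compare them with the observed copayments:

from sklearn.metrics import mean_absolute_error, mean_squared_error

# back-transform predictions from log-dollars to dollars
dollar_preds = np.exp(final_pipe.predict(pharmacy_test[nominal_features]))

# absolute and squared error on the original dollar scale
mae = mean_absolute_error(pharmacy_test.patient_pay, dollar_preds)
rmse = np.sqrt(mean_squared_error(pharmacy_test.patient_pay, dollar_preds))
print(f"MAE: ${mae:.2f}, RMSE: ${rmse:.2f}")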

We’ll also examine the model’s predictions plotted against the actual values:

preds = pd.DataFrame(
    np.exp(final_pipe.predict(pharmacy_test[nominal_features])),
    columns = ["prediction"]
)

full_data = \
    pharmacy_test \
    .reset_index() \
    .join(preds, how = "left")
plt.figure(figsize=(12, 6))

plt.scatter(
    x = "patient_pay",
    y = "prediction",
    alpha = 0.5,
    data = full_data
)
plt.xlabel("Observed Copayments")
plt.ylabel("Predicted Copayments")

[Figure: predicted vs. observed copayments on the test set]

Now we examine the predicted copayments by diagnosis category:

plot_data = \
    full_data \
    .assign(diagnosis_collapsed = full_data.diagnosis.str[0]) \
    .groupby("diagnosis_collapsed") \
    .aggregate({"patient_pay": "mean", "prediction": "mean"}) \
    .sort_values("patient_pay") \
    .reset_index()

plt.figure(figsize=(12, 6))
sns.scatterplot(
    x = "diagnosis_collapsed",
    y = "patient_pay",
    marker = "o",
    edgecolor = "black",
    facecolor = "none",
    data = plot_data
)

sns.scatterplot(
    x = "diagnosis_collapsed",
    y = "prediction",
    facecolor = "red",
    data = plot_data
)
plt.xlabel("Broad Diagnosis Category")
plt.ylabel("Predicted Copayment")
plt.title("Open circles represent the observed copayment and the red circles show the predicted copayment.")

[Figure: observed vs. predicted mean copayment by broad diagnosis category]

Diagnosis “S” is predicted to have the lowest copayment on average at just over 10 dollars, while diagnosis “Z” is predicted to have the highest with a value of about 95 dollars.

We can also visualize this figure broken down by the drug prescribed:

plot_data = \
    full_data \
    .assign(
        diagnosis_collapsed = full_data.diagnosis.str[0],
        generic_drug = pd.Categorical(np.where(full_data.drug.str.contains("generic"), 1, 0))
    )

plot_data = \
    plot_data \
    .assign(
        generic_drug = lambda x: x.generic_drug.cat.rename_categories({0: "Branded", 1: "Generic"})
    ) \
    .groupby(["diagnosis_collapsed", "generic_drug"]) \
    .aggregate({"patient_pay": "mean", "prediction": "mean"}) \
    .reset_index()

plot_data = plot_data.melt(
    id_vars = ["diagnosis_collapsed", "generic_drug"],
    var_name = "estimate",
    value_vars = ["patient_pay", "prediction"]
)

# catplot is figure-level and manages its own figure
g = sns.catplot(
    x = "generic_drug",
    y = "value",
    hue = "estimate",
    s = 10,
    col = "diagnosis_collapsed",
    col_wrap = 4,
    sharex = False,
    data = plot_data
)

(g.set_axis_labels("Drug Type", "Patient Copayment")
    .set_titles("Diagnosis: {col_name}")) 

[Figure: observed vs. predicted mean copayment by drug type, faceted by broad diagnosis]

Here we can see the predicted mean copayments by drug type (branded or generic) for each broad diagnosis category.

Finally, we can examine the predicted copayment by insurance plan for each drug.

plot_data = \
    full_data \
    .assign(
        generic_drug = pd.Categorical(np.where(full_data.drug.str.contains("generic"), 1, 0))
    )

plot_data = \
    plot_data \
    .assign(
        generic_drug = lambda x: x.generic_drug.cat.rename_categories({0: "Branded", 1: "Generic"})
    ) \
    .groupby(["bin", "pcn", "generic_drug"]) \
    .aggregate({"patient_pay": "mean", "prediction": "mean"}) \
    .reset_index()

# assign the grid so set_axis_labels below applies to this plot
g = sns.catplot(
    x = "pcn",
    y = "prediction",
    hue = "generic_drug",
    s = 10,
    col = "bin",
    col_wrap = 4,
    sharex = False,
    sharey = False,
    data = plot_data
)

(g.set_axis_labels("PCN", "Predicted Copayment")
    .set_titles("BIN: {col_name}")) 

[Figure: predicted copayment by PCN and drug type, faceted by BIN]

This figure shows the predicted copayment for branded and generic drugs under each BIN/PCN combination, giving a sense of how much a drug would cost under a specific insurance plan configuration.

We could also examine the predicted copayment for specific drugs across insurance plans:

plot_data = \
    full_data \
    .assign(
        generic_drug = pd.Categorical(np.where(full_data.drug.str.contains("generic"), 1, 0)),
        drug_alt = full_data.drug.str.replace("generic|branded", "", regex = True)
    ) \
    .assign(drug_alt = lambda x: x.drug_alt.str.strip()) 

plot_data = \
    plot_data \
    .assign(
        generic_drug = lambda x: x.generic_drug.cat.rename_categories({0: "Branded", 1: "Generic"})
    ) \
    .groupby(["bin", "drug_alt", "generic_drug"]) \
    .aggregate({"patient_pay": "mean", "prediction": "mean"}) \
    .reset_index()

g = sns.catplot(
    x = "bin",
    y = "prediction",
    hue = "generic_drug",
    s = 10,
    col = "drug_alt",
    col_wrap = 4,
    sharex = False,
    sharey = False,
    data = plot_data
)

(g.set_axis_labels("Insurance Plan", "Predicted Copayment")
    .set_titles("Drug: {col_name}")) 

[Figure: predicted copayment by insurance plan (BIN) and drug type, faceted by drug]

This plot shows predicted copayment of a given drug across insurance plans. To make it easier to digest, we could select a single drug to examine. Hivir, for example:

plot_data = \
    full_data \
    .assign(
        generic_drug = pd.Categorical(np.where(full_data.drug.str.contains("generic"), 1, 0)),
        drug_alt = full_data.drug.str.replace("generic|branded", "", regex = True)
    ) \
    .assign(drug_alt = lambda x: x.drug_alt.str.strip()) 

plot_data = \
    plot_data \
    .assign(
        generic_drug = lambda x: x.generic_drug.cat.rename_categories({0: "Branded", 1: "Generic"})
    ) \
    .groupby(["bin", "drug_alt", "generic_drug"]) \
    .aggregate({"patient_pay": "mean", "prediction": "mean"}) \
    .reset_index()

plot_data = plot_data[plot_data.drug_alt == "hivir"]

plt.figure(figsize=(12, 6))
sns.scatterplot(
    x = "bin",
    y = "prediction",
    hue = "generic_drug",
    data = plot_data
)
plt.xlabel("Insurance Plan")
plt.ylabel("Predicted Copayment")
plt.title("Open circles represent the observed copayment and the red circles show the predicted copayment.")

[Figure: predicted Hivir copayment by insurance plan]

Objective II: Predict the Formulary Status of Medications on Each Insurance Plan

To accomplish this objective, we will use classification models to estimate the probability that an insurance claim is approved. We then bin these predicted probabilities into three groups that capture each drug's formulary status (rejected, non-preferred, or preferred):

  • Predicted probabilities between $[0, 0.5)$ are grouped into the rejected group
  • Predicted probabilities between $[0.5, 0.75)$ are grouped into the non-preferred group
  • Predicted probabilities between $[0.75, 1.0]$ are grouped into the preferred group

With these groups, we can further determine whether each drug is a generic, preferred non-generic, non-preferred non-generic, etc.
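
A minimal sketch of the binning step (the approval_probs array here is hypothetical; in practice it would come from a fitted classifier's predict_proba):

import numpy as np

# hypothetical predicted approval probabilities for three claims
approval_probs = np.array([0.10, 0.62, 0.91])

# [0, 0.5) -> rejected, [0.5, 0.75) -> non-preferred, [0.75, 1.0] -> preferred
formulary_status = np.select(
    [approval_probs < 0.5, approval_probs < 0.75],
    ["rejected", "non-preferred"],
    default = "preferred"
)
# -> ['rejected', 'non-preferred', 'preferred']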

To start, the rejected claims need to be added back to the data set and I will create a new train-test split and validation folds.

pharmacy_train.head()

tx_date pharmacy diagnosis drug bin pcn group rejected patient_pay claim_status tx_month log_patient_pay
5556656 2022-06-07 Pharmacy #47 U61.13 generic momudobatin 725700 1UQC NaN False 37.35 Approved 6 3.620333
11088122 2022-10-21 Pharmacy #36 K87.68 generic vocopirin 664344 NaN 52H8KH0F83K False 6.11 Approved 10 1.809927
7189146 2022-07-18 Pharmacy #26 K87.68 branded cuxirin 725700 9C5MOR3 S2QKZ0OFNWS6X False 99.85 Approved 7 4.603669
2776853 2022-03-22 Pharmacy #12 V80.87 branded cycloxasonol 322463 NaN HO8HUGL False 6.20 Approved 3 1.824549
2423061 2022-03-13 Pharmacy #35 Y51.55 branded sorine 664344 DY4B NaN False 20.74 Approved 3 3.032064
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
import matplotlib.pyplot as plt
import seaborn as sns

pharmacy_data_raw = pd.read_csv("pharmacy_tx.csv")

# recode variables
pharmacy_data = \
    pharmacy_data_raw \
    .assign(
        claim_status = pd.Categorical(pharmacy_data_raw.rejected),
        diagnosis = pd.Categorical(pharmacy_data_raw.diagnosis),
        drug = pd.Categorical(pharmacy_data_raw.drug),
        bin = pd.Categorical(pharmacy_data_raw.bin),
        pcn = pd.Categorical(pharmacy_data_raw.pcn),
        group = pd.Categorical(pharmacy_data_raw.group),
        claim_outcome = np.where(pharmacy_data_raw.rejected == True, 1, 0)
    )

pharmacy_data = \
    pharmacy_data \
    .assign(
        claim_status = lambda x: x.claim_status.cat.rename_categories({0: "Approved", 1: "Rejected"})
    )

# create month variable
pharmacy_data = \
    pharmacy_data \
    .assign(
        tx_month = pd.DatetimeIndex(pharmacy_data["tx_date"]).month
    )


# the data set is too large for my local machine to run models on!
pharmacy_data = pharmacy_data.sample(n = 10000, random_state = 1)

# train-test split
pharmacy_train, pharmacy_test = train_test_split(
    pharmacy_data,
    shuffle = True,
    random_state = 1,
    test_size = 0.2,
    stratify = pharmacy_data["claim_status"]
)
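
Rejected claims appear to be a small minority of the data, which is why this split is stratified on claim status. A quick check of the class balance (a sketch; the exact shares depend on the sample) shows that stratification keeps the proportions consistent across partitions:

# share of approved vs. rejected claims in each partition
print(pharmacy_data.claim_status.value_counts(normalize = True))
print(pharmacy_train.claim_status.value_counts(normalize = True))
print(pharmacy_test.claim_status.value_counts(normalize = True))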

Model Building

Model Specifications

With the data prepped, I now create the model specifications complete with several preprocessing steps. These steps include:

  • Creating dummy variables for each categorical variable
  • Grouping infrequently occurring categories (such as rare diagnoses) into a catch-all “infrequent” category via the encoder’s min_frequency argument

Since these steps are identical, I reuse the preprocessing pipeline defined above.

Models

To classify claim status, I will use the following models:

  • K-nearest neighbors
  • Binomial regression (with a Logit link function)
  • Boosted tree
  • Random forest
  • Support vector machine
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
import xgboost
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
# k-nearest neighbors
knn_c_pipe = Pipeline([
    ("preprocessor", preprocessing_pipeline),
    ("knn_c", KNeighborsClassifier())
])

# binomial regression; the saga solver supports both l1 and l2 penalties
logit_pipe = Pipeline([
    ("preprocessor", preprocessing_pipeline),
    ("logit", LogisticRegression(random_state = 2, solver = "saga", max_iter = 5000))
])

# xgboost
xgb_c_pipe = Pipeline([
    ("preprocessor", preprocessing_pipeline),
    ("xgb_c", xgboost.XGBClassifier())
])

# random forest
rf_c_pipe = Pipeline([
    ("preprocessor", preprocessing_pipeline),
    ("rf_c", RandomForestClassifier(n_estimators = 1000))
])

# support vector machine
svm_c_pipe = Pipeline([
    ("preprocessor", preprocessing_pipeline),
    ("svm_c", SVC())
])

Model Tuning and Fitting

Define Parameter Grids

This section defines the parameter grids for model tuning.

# set grid search parameters
neighbors = list(range(1, 40))
param_range = [1, 5, 10, 15, 20, 25]
param_range_fl = [1.0, 0.75, 0.50, 0.25, 0.10]
n_estimators = [250, 750, 1250, 1750, 2000]
learning_rates = [0.1, 0.2, 0.3, 0.4, 0.5]

# knn parameter grid
knn_c_param_grid = [
    {"knn_c__n_neighbors": neighbors,
    "knn_c__weights": ["uniform", "distance"],
    "knn_c__metric": ["euclidean", "manhattan"]}
]

# binomial parameter grid ("none" ignores C, and "elasticnet" would
# additionally require an l1_ratio, so the grid sticks to l1 and l2)
logit_param_grid = [
    {"logit__penalty": ["l1", "l2"],
    "logit__C": param_range}
]

# xgboost parameter grid
xgb_c_param_grid = [
    {"xgb_c__learning_rate": learning_rates,
    "xgb_c__max_depth": param_range,
    "xgb_c__min_child_weight": param_range[:2],
    "xgb_c__subsample": param_range_fl,
    "xgb_c__n_estimators": n_estimators}
]

# random forest parameter grid
rf_c_param_grid = {
    "rf_c__min_samples_leaf": param_range,
    "rf_c__min_samples_split": param_range[1:]
}

# support vector machine parameter grid
svm_c_param_grid = [
    {"svm_c__kernel": ["linear", "rbf"],
    "svm_c__C": param_range}
]

Search Grids

Here we set up the parameter search grids.

knn_c_grid_search = \
    GridSearchCV(
        estimator = knn_c_pipe,
        param_grid = knn_c_param_grid,
        scoring = "accuracy",
        cv = 5,
        n_jobs = 4
    )

logit_grid_search = \
    GridSearchCV(
        estimator = logit_pipe,
        param_grid = logit_param_grid,
        scoring = "accuracy",
        cv = 5,
        n_jobs = 4
    )

xgboost_c_grid_search = \
    GridSearchCV(
        estimator = xgb_c_pipe,
        param_grid = xgb_c_param_grid,
        scoring = "accuracy",
        cv = 5,
        n_jobs = 4
    )

rf_c_grid_search = \
    GridSearchCV(
        estimator = rf_c_pipe,
        param_grid = rf_c_param_grid,
        scoring = "accuracy",
        cv = 5,
        n_jobs = 4
    )

svm_c_grid_search = \
    GridSearchCV(
        estimator = svm_c_pipe,
        param_grid = svm_c_param_grid,
        scoring = "accuracy",
        cv = 5,
        n_jobs = 4
    )

Model Testing

Before fitting the models to the full training set, I first estimate them on a much smaller random sample from the training set to identify any model specification errors. This way I can more efficiently ensure that the models will run on the full data without issue. Each model is then tuned with an exhaustive grid search over its parameter grid, using 5-fold cross-validation.

# test models on a smaller sample to identify errors;
# random_state makes the draw reproducible
model_test_data = pharmacy_train.sample(1000, random_state = 2)

knn_c_grid_search.fit(
    X = model_test_data[nominal_features],
    y = model_test_data.claim_outcome
)
logit_grid_search.fit(
    X = model_test_data[nominal_features],
    y = model_test_data.claim_outcome
)
xgboost_c_grid_search.fit(
    X = model_test_data[nominal_features],
    y = model_test_data.claim_outcome
)
rf_c_grid_search.fit(
    X = model_test_data[nominal_features],
    y = model_test_data.claim_outcome
)
svm_c_grid_search.fit(
    X = model_test_data[nominal_features],
    y = model_test_data.claim_outcome
)

Full Model Fitting

Now I fit and tune the models on the full training set, with GridSearchCV handling the cross-validation folds.

c_grids = [knn_c_grid_search, logit_grid_search, xgboost_c_grid_search, rf_c_grid_search, svm_c_grid_search]

for grid in c_grids:
    grid.fit(
        X = pharmacy_train[nominal_features], 
        y = pharmacy_train.claim_outcome
    )

import pickle
pickle.dump(c_grids, open("sklearn_c_models.sav", 'wb'))
c_results = []

for model in c_grids:
    c_results.append(model.best_score_)

accuracy_scores = pd.DataFrame(
    c_results, 
    columns = ["accuracy"],
    index = ["KNN", "Logit", "XGBoost", "Random Forest", "SVM"]
).sort_values("accuracy")

accuracy_scores

                accuracy
Logit           0.919000
Random Forest   0.921250
SVM             0.921250
KNN             0.921375
XGBoost         0.921625

XGBoost again achieves the best cross-validated performance, though all five models reach roughly 92% accuracy.