Credit Default Risk Scorecard

Predictive Risk Modeling Framework Using the R Tidymodel Package

Author

Patrick Lefler

Published

November 25, 2025

Note

Tidymodels is a comprehensive ecosystem of R packages designed to streamline the machine learning and statistical modeling workflow using “tidy” data principles. By providing a unified and consistent syntax, it allows users to easily switch between different underlying computational engines—such as random forests or linear regression—without needing to learn unique code for each algorithm. The framework covers the entire modeling pipeline, offering specialized tools for essential tasks including data splitting, preprocessing, model tuning, and performance evaluation.

Executive Summary

This analysis develops a comprehensive credit default risk scorecard using machine learning techniques. The scorecard provides:

  • Predictive Models: Logistic regression and random forest algorithms
  • Risk Segmentation: Customer stratification into risk tiers
  • Portfolio Analytics: Monte Carlo simulation for expected losses
  • Decision Tools: Interactive visualizations for risk assessment

1. Data Overview

1.1 Data Quality Assessment

Display Code
# Summary statistics
train_summary <- train_data %>%
  summarise(
    `Total Observations` = comma(n()),
    `Default Rate` = percent(mean(default == "Yes"), accuracy = 0.01),
    `Avg Loan Amount` = dollar(mean(loan_amount)),
    `Avg Income` = dollar(mean(income)),
    `Avg DTI` = percent(mean(debt_to_income), accuracy = 0.1),
    `Missing Values` = as.character(sum(is.na(.)))
  ) %>%
  pivot_longer(everything(), names_to = "Metric", values_to = "Value")

kable(train_summary, 
      caption = "Dataset Summary Statistics",
      format = "html") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Dataset Summary Statistics
Metric Value
Total Observations 3,500
Default Rate 39.09%
Avg Loan Amount $23,030,993
Avg Income $250,000
Avg DTI 34.9%
Missing Values 0

1.2 Target Variable Distribution


2. Exploratory Data Analysis

2.1 Feature Distributions by Default Status

Display Code
# Select key numeric features
key_features <- train_data %>%
  select(
    default,
    income,
    debt_to_income,
    credit_utilization,
    num_delinquencies,
    loan_amount,
    interest_rate
  ) %>%
  pivot_longer(-default, names_to = "feature", values_to = "value")

p <- ggplot(key_features, aes(x = value, y = feature, fill = default)) +
  geom_density_ridges(alpha = 0.7, scale = 1.5) +
  scale_fill_manual(values = risk_colors) + 
  facet_wrap(~feature, scales = "free", ncol = 2) +
  labs(
    title = "Fig (2.1): Feature Distributions by Default Status",
    x = "Value",
    y = "Feature",
    fill = "Default"
  ) +
  theme(strip.text = element_text(face = "bold")) +
  theme_gray() +
  themeMain 

p

2.2 Risk Factor Correlation Matrix

Display Code
####| fig-cap: "Correlation heatmap of numeric predictors"

# Compute correlation matrix
cor_data <- train_data %>%
  select(where(is.numeric), -customer_id) %>%
  cor(use = "complete.obs")

# Convert to long format
cor_long <- cor_data %>%
  as.data.frame() %>%
  rownames_to_column("var1") %>%
  pivot_longer(-var1, names_to = "var2", values_to = "correlation")

cor_long[cor_long == 1] <- NA

plot_2.2 <- ggplot(cor_long, aes(x = var1, y = var2, fill = correlation)) +
  geom_tile() +
  scale_fill_gradient2(
    low = "#3498db",
    mid = "#DCDCDC",
    high = "#e74c3c",
    midpoint = 0,
    limits = c(-.04, .04)
  ) +
  labs(
    title = "Fig (2.2): Feature Correlation Matrix",
    x = NULL,
    y = NULL,
    fill = "Correlation"
  )  +
  theme_gray() +
  themePlotly +
  theme(
  axis.text.x = element_text(angle = 45, hjust = 1),
  panel.grid = element_blank()
  )

ggplotly(plot_2.2)

2.3 Default Rates by Categorical Features

Display Code
categorical_default <- train_data %>%
  select(default, loan_purpose, housing_status) %>%
  pivot_longer(-default, names_to = "category", values_to = "value") %>%
  group_by(category, value, default) %>%
  summarise(n = n(), .groups = "drop") %>%
  group_by(category, value) %>%
  mutate(
    total = sum(n),
    percentage = n / total
  ) %>%
  filter(default == "Yes")

p <- ggplot(categorical_default, aes(x = reorder(value, percentage), y = percentage, fill = category)) +
  geom_col(alpha = 0.8, show.legend = FALSE) +
  geom_text(aes(label = percent(percentage, accuracy = 0.1)), hjust = -0.1, size = 4) +
  scale_y_continuous(labels = percent, limits = c(0, max(categorical_default$percentage) * 1.15)) +
  coord_flip() +
  facet_wrap(~category, scales = "free_y", ncol = 1) +
  labs(
    title = "Fig (2.3): Default Rates by Categorical Variables",
    x = NULL,
    y = "Default Rate"
  ) +
  theme(strip.text = element_text(face = "bold")) +
  theme_gray() +
  themeMain 

p


3. Model Development

3.1 Feature Engineering & Recipe

Display Code
# Create modeling recipe
credit_recipe <- recipe(default ~ ., data = train_data) %>%
  # Remove ID variable
  step_rm(customer_id) %>%
  # Remove zero-variance predictors
  step_zv(all_predictors()) %>%
  # One-hot encode categorical variables
  step_dummy(all_nominal_predictors()) %>%
  # Normalize numeric predictors
  step_normalize(all_numeric_predictors()) %>%
  # Handle class imbalance with SMOTE
  step_smote(default, over_ratio = 0.8)

# Prep to see processed features
credit_recipe_prepped <- prep(credit_recipe)

cat("Recipe successfully created with", 
    length(credit_recipe_prepped$term_info$variable), 
    "features after preprocessing\n")
Recipe successfully created with 17 features after preprocessing

3.2 Model Specifications

Display Code
# Logistic Regression
logistic_spec <- logistic_reg(penalty = tune(), mixture = 1) %>%
  set_engine("glmnet") %>%
  set_mode("classification")

# Random Forest
rf_spec <- rand_forest(
  mtry = tune(),
  trees = 500,
  min_n = tune()
) %>%
  set_engine("ranger", importance = "impurity") %>%
  set_mode("classification")

# Create workflows
logistic_wf <- workflow() %>%
  add_recipe(credit_recipe) %>%
  add_model(logistic_spec)

rf_wf <- workflow() %>%
  add_recipe(credit_recipe) %>%
  add_model(rf_spec)

3.3 Cross-Validation Setup

Display Code
set.seed(456)
cv_folds <- vfold_cv(train_data, v = 5, strata = default)

# Metrics
credit_metrics <- metric_set(roc_auc, accuracy, sensitivity, specificity)

cat("5-fold cross-validation with stratification\n")
5-fold cross-validation with stratification

3.4 Hyperparameter Tuning

Display Code
# Logistic regression grid
logistic_grid <- grid_regular(
  penalty(range = c(-5, 0)),
  levels = 10
)

# Random forest grid
rf_grid <- grid_regular(
  mtry(range = c(3, 8)),
  min_n(range = c(5, 20)),
  levels = 5
)

# Tune logistic regression
set.seed(789)
logistic_tune <- tune_grid(
  logistic_wf,
  resamples = cv_folds,
  grid = logistic_grid,
  metrics = credit_metrics,
  control = control_grid(save_pred = TRUE, verbose = FALSE)
)

# Tune random forest
set.seed(789)
rf_tune <- tune_grid(
  rf_wf,
  resamples = cv_folds,
  grid = rf_grid,
  metrics = credit_metrics,
  control = control_grid(save_pred = TRUE, verbose = FALSE)
)

cat("Hyperparameter tuning complete\n")
Hyperparameter tuning complete

3.5 Model Performance Comparison

Display Code
# Collect metrics
logistic_results <- logistic_tune %>%
  collect_metrics() %>%
  mutate(model = "Logistic Regression")

rf_results <- rf_tune %>%
  collect_metrics() %>%
  mutate(model = "Random Forest")

# Best models
best_logistic <- select_best(logistic_tune, metric = "roc_auc")
best_rf <- select_best(rf_tune, metric = "roc_auc")

# Combine and display top results
all_results <- bind_rows(logistic_results, rf_results) %>%
  filter(.metric == "roc_auc") %>%
  arrange(desc(mean)) %>%
  head(10)

kable(all_results %>% select(model, mean, std_err), 
      digits = 4,
      col.names = c("Model", "ROC AUC", "Std Error"),
      caption = "Top 10 Model Configurations by ROC AUC") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Top 10 Model Configurations by ROC AUC
Model ROC AUC Std Error
Logistic Regression 0.7239 0.0102
Logistic Regression 0.7231 0.0104
Logistic Regression 0.7226 0.0104
Logistic Regression 0.7226 0.0104
Logistic Regression 0.7226 0.0104
Logistic Regression 0.7226 0.0104
Logistic Regression 0.7224 0.0106
Random Forest 0.7102 0.0082
Random Forest 0.7099 0.0100
Random Forest 0.7088 0.0095
Display Code
####| fig-cap: "Hyperparameter tuning results"

plot_3.5 <- bind_rows(logistic_results, rf_results) %>%
  filter(.metric == "roc_auc") %>%
  ggplot(aes(x = model, y = mean, fill = model)) +
  geom_boxplot(alpha = 0.7) +
  geom_jitter(width = 0.2, alpha = 0.3) +
  scale_fill_brewer(palette = "Set2") +
  labs(
    title = "Fig (3.5): Model Performance Comparison (Cross-Validation)",
    x = "Model",
    y = "ROC AUC",
    fill = "Model"
  ) +
  theme(legend.position = "none")

ggplotly(plot_3.5)

4. Final Model Training & Evaluation

4.1 Train Final Models

Display Code
# Finalize workflows
final_logistic_wf <- finalize_workflow(logistic_wf, best_logistic)
final_rf_wf <- finalize_workflow(rf_wf, best_rf)

# Fit on full training set
final_logistic_fit <- fit(final_logistic_wf, train_data)
final_rf_fit <- fit(final_rf_wf, train_data)

cat("Final models trained on full training set\n")
Final models trained on full training set

4.2 Test Set Performance

Display Code
# Generate predictions
logistic_pred <- augment(final_logistic_fit, test_data) %>%
  mutate(default = factor(default, levels = c("No", "Yes")))

rf_pred <- augment(final_rf_fit, test_data) %>%
  mutate(default = factor(default, levels = c("No", "Yes")))

# Calculate metrics using explicit classification metric set
logistic_metrics <- bind_rows(
  logistic_pred %>% accuracy(truth = default, estimate = .pred_class),
  logistic_pred %>% roc_auc(truth = default, .pred_Yes),
  logistic_pred %>% sensitivity(truth = default, estimate = .pred_class),
  logistic_pred %>% specificity(truth = default, estimate = .pred_class),
  logistic_pred %>% precision(truth = default, estimate = .pred_class),
  logistic_pred %>% recall(truth = default, estimate = .pred_class)
) %>%
  mutate(model = "Logistic Regression")

rf_metrics <- bind_rows(
  rf_pred %>% accuracy(truth = default, estimate = .pred_class),
  rf_pred %>% roc_auc(truth = default, .pred_Yes),
  rf_pred %>% sensitivity(truth = default, estimate = .pred_class),
  rf_pred %>% specificity(truth = default, estimate = .pred_class),
  rf_pred %>% precision(truth = default, estimate = .pred_class),
  rf_pred %>% recall(truth = default, estimate = .pred_class)
) %>%
  mutate(model = "Random Forest")

# Combine results
test_results <- bind_rows(logistic_metrics, rf_metrics) %>%
  select(model, .metric, .estimate) %>%
  pivot_wider(names_from = .metric, values_from = .estimate)

kable(test_results, 
      digits = 4,
      caption = "Test Set Performance Metrics") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Test Set Performance Metrics
model accuracy roc_auc sensitivity specificity precision recall
Logistic Regression 0.6940 0.2707 0.7708 0.5658 0.7477 0.7708
Random Forest 0.6767 0.3040 0.7836 0.4982 0.7227 0.7836

4.3 ROC Curves

Display Code
# Calculate ROC curves
logistic_roc <- logistic_pred %>%
  roc_curve(truth = default, .pred_Yes) %>%
  mutate(model = "Logistic Regression")

rf_roc <- rf_pred %>%
  roc_curve(truth = default, .pred_Yes) %>%
  mutate(model = "Random Forest")

roc_data <- bind_rows(logistic_roc, rf_roc)

# Calculate AUC for annotation
logistic_auc <- logistic_pred %>%
  roc_auc(truth = default, .pred_Yes) %>%
  pull(.estimate)

rf_auc <- rf_pred %>%
  roc_auc(truth = default, .pred_Yes) %>%
  pull(.estimate)

plot_4.3 <- ggplot(roc_data, aes(x = 1 - specificity, y = sensitivity, color = model)) +
  geom_line(size = 1.2) +
  geom_abline(linetype = "dashed", color = "gray50") +
  annotate("text", x = 0.7, y = 0.3, 
           label = paste0("Logistic AUC: ", round(logistic_auc, 3)), 
           color = "#F8766D", size = 4) +
  annotate("text", x = 0.7, y = 0.2, 
           label = paste0("RF AUC: ", round(rf_auc, 3)), 
           color = "#00BFC4", size = 4) +
  scale_color_brewer(palette = "Set1") +
  labs(
    title = "Fig (4.3): ROC Curve Comparison",
    x = "False Positive Rate (1 - Specificity)",
    y = "True Positive Rate (Sensitivity)",
    color = "Model"
  ) +
  coord_equal()

ggplotly(plot_4.3)

4.4 Calibration Plot

Display Code
# Create calibration data
calibration_data <- bind_rows(
  logistic_pred %>% 
    mutate(model = "Logistic Regression"),
  rf_pred %>% 
    mutate(model = "Random Forest")
) %>%
  mutate(
    pred_bin = cut(.pred_Yes, 
                   breaks = seq(0, 1, 0.1),
                   include.lowest = TRUE)
  ) %>%
  group_by(model, pred_bin) %>%
  summarise(
    predicted = mean(.pred_Yes),
    observed = mean(default == "Yes"),
    n = n(),
    .groups = "drop"
  )

plot_4.4 <- ggplot(calibration_data, aes(x = predicted, y = observed, color = model)) +
  geom_point(aes(size = n), alpha = 0.6) +
  geom_line() +
  geom_abline(linetype = "dashed", color = "gray50") +
  scale_color_brewer(palette = "Set1") +
  scale_size_continuous(range = c(2, 10)) +
  labs(
    title = "Fig (4.4): Calibration Plot",
    x = "Predicted Default Probability",
    y = "Observed Default Rate",
    color = "Model",
    size = "N"
  ) +
  coord_equal(xlim = c(0, 1), ylim = c(0, 1))

ggplotly(plot_4.4)

4.5 Confusion Matrices

Display Code
# Logistic confusion matrix
logistic_cm <- logistic_pred %>%
  conf_mat(truth = default, estimate = .pred_class) %>%
  autoplot(type = "heatmap") +
  scale_fill_gradient(low = "#ecf0f1", high = "#3498db") +
  labs(
    title = "Fig (4.5): Confusion Matrix for Logistic Regression",
    x = "Predicted",
    y = "Actual"
  ) 

# Random forest confusion matrix
rf_cm <- rf_pred %>%
  conf_mat(truth = default, estimate = .pred_class) %>%
  autoplot(type = "heatmap") +
  scale_fill_gradient(low = "#ecf0f1", high = "#e74c3c") +
  labs(
    title = "Confusion Matrix for Random Forest",
    x = "Predicted",
    y = "Actual"
  )

logistic_cm + rf_cm


5. Feature Importance & Interpretation

5.1 Variable Importance

Display Code
# Extract and plot variable importance
rf_importance <- final_rf_fit %>%
  extract_fit_parsnip() %>%
  vip(num_features = 15, aesthetics = list(fill = "#3498db", alpha = 0.8)) +
  labs(title = "Feature Importance - Random Forest Model")

print(rf_importance)

5.2 Partial Dependence Plots

Display Code
###| fig-cap: "Partial dependence plots showing relationship between key features and default probability"
#| message: false

library(pdp)

# Create custom prediction function
pred_wrapper <- function(object, newdata) {
  predict(object, newdata, type = "prob")$.pred_Yes
}

# Key features for PDP
key_vars <- c("debt_to_income", "credit_utilization", "income", "num_delinquencies")

# Generate partial dependence plots
pdp_list <- map(key_vars, ~{
  partial(final_rf_fit, 
          pred.var = .x, 
          pred.fun = pred_wrapper,
          train = train_data,
          plot = FALSE) %>%
    as_tibble() %>%
    mutate(variable = .x)
})

pdp_data <- bind_rows(pdp_list)

plot_5.2 <- ggplot(pdp_data, aes(x = get(names(pdp_data)[1]), y = yhat)) +
  geom_line(color = "#e74c3c", size = 1.2) +
  geom_smooth(se = TRUE, alpha = 0.2, color = "#3498db") +
  facet_wrap(~variable, scales = "free_x", ncol = 2) +
  labs(
    title = "Fig (5.2): Partial Dependence Plots",
    subtitle = "Impact of individual features on default probability",
    x = "Feature Value",
    y = "Predicted Default Probability"
  ) +
  theme(strip.text = element_text(face = "bold")) + 
theme_gray() +
  themeMain 

plot_5.2


6. Risk Scorecard Development

6.1 Score Generation

Display Code
# Add predictions to full dataset
scored_data <- full_data %>%
  bind_cols(
    predict(final_rf_fit, full_data, type = "prob")
  ) %>%
  mutate(
    # Create risk score (0-1000)
    risk_score = round(.pred_Yes * 1000, 0),
    
    # Categorize into risk tiers
    risk_tier = case_when(
      risk_score < 200 ~ "Low Risk",
      risk_score < 400 ~ "Medium-Low Risk",
      risk_score < 600 ~ "Medium Risk",
      risk_score < 800 ~ "Medium-High Risk",
      TRUE ~ "High Risk"
    ),
    risk_tier = factor(risk_tier, levels = c(
      "Low Risk", "Medium-Low Risk", "Medium Risk", 
      "Medium-High Risk", "High Risk"
    ))
  )

# Summary by risk tier
risk_summary <- scored_data %>%
  group_by(risk_tier) %>%
  summarise(
    count = n(),
    default_rate = mean(default == "Yes"),
    avg_score = mean(risk_score),
    avg_loan = mean(loan_amount),
    total_exposure = sum(loan_amount),
    .groups = "drop"
  ) %>%
  mutate(
    portfolio_pct = count / sum(count),
    exposure_pct = total_exposure / sum(total_exposure)
  )

kable(risk_summary %>%
        select(risk_tier, count, default_rate, avg_score, total_exposure, portfolio_pct),
      digits = c(0, 0, 4, 0, 0, 4),
      col.names = c("Risk Tier", "Count", "Default Rate", "Avg Score", 
                    "Total Exposure", "Portfolio %"),
      caption = "Risk Tier Summary Statistics") %>%
  kable_styling(bootstrap_options = c("striped", "hover")) %>%
  row_spec(which(risk_summary$default_rate > 0.3), background = "#fadbd8")
Risk Tier Summary Statistics
Risk Tier Count Default Rate Avg Score Total Exposure Portfolio %
Low Risk 719 0.0250 148 16881523462 0.1438
Medium-Low Risk 1880 0.0846 297 42927303889 0.3760
Medium Risk 1236 0.5324 498 29453951196 0.2472
Medium-High Risk 1039 0.9346 686 24256558268 0.2078
High Risk 126 0.9841 831 2663233726 0.0252

6.2 Risk Score Distribution

Display Code
plot_6.2 <- ggplot(scored_data, aes(x = risk_score, fill = default)) +
  geom_histogram(bins = 50, alpha = 0.7, position = "identity") +
  geom_vline(xintercept = c(200, 400, 600, 800), 
             linetype = "dashed", color = "gray30") +
  scale_fill_manual(values = risk_colors) +
  scale_x_continuous(breaks = seq(0, 1000, 100)) +
  labs(
    title = "Fig (6.2): Distribution of Risk Scores across Portfolio",
    x = "Risk Score (0-1000)",
    y = "Count",
    fill = "Default"
  )

plot_6.2

6.3 Risk Tier Performance

Display Code
####| fig-cap: "Risk tier analysis showing default rates and portfolio composition"

# Default rates by tier
plot_6.3_1 <- ggplot(risk_summary, aes(x = risk_tier, y = default_rate, fill = risk_tier)) +
  geom_col(alpha = 0.8) +
  geom_text(aes(label = percent(default_rate, accuracy = 0.1)), 
            vjust = -0.5, size = 4) +
  scale_fill_brewer(palette = "RdYlGn", direction = -1) +
  scale_y_continuous(labels = percent, expand = expansion(mult = c(0, 0.1))) +
  labs(
    title = "Fig (6.3a): Default Rates by Risk Tier",
    x = NULL,
    y = "Default Rate"
  ) +
  themeMain +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none"
  )

# Portfolio composition
plot_6.3_2 <- ggplot(risk_summary, aes(x = risk_tier, y = portfolio_pct, fill = risk_tier)) +
  geom_col(alpha = 0.8) +
  geom_text(aes(label = percent(portfolio_pct, accuracy = 0.1)), 
            vjust = -0.5, size = 4) +
  scale_fill_brewer(palette = "RdYlGn", direction = -1) +
  scale_y_continuous(labels = percent, expand = expansion(mult = c(0, 0.1))) +
  labs(
    title = "Fig (6.3b): Portfolio Distribution",
    x = NULL,
    y = "% of Portfolio"
  ) +
  themeMain +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none"
  )

plot_6.3_1 / plot_6.3_2


7. Portfolio Risk Analytics

7.1 Loss Given Default Assumptions

Display Code
# Define loss parameters
lgd_assumptions <- tibble(
  risk_tier = c("Low Risk", "Medium-Low Risk", "Medium Risk", 
                "Medium-High Risk", "High Risk"),
  lgd_rate = c(0.30, 0.40, 0.50, 0.60, 0.70),
  recovery_rate = 1 - lgd_rate
)

kable(lgd_assumptions,
      digits = 2,
      col.names = c("Risk Tier", "Loss Given Default", "Recovery Rate"),
      caption = "Loss Given Default Assumptions by Risk Tier") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Loss Given Default Assumptions by Risk Tier
Risk Tier Loss Given Default Recovery Rate
Low Risk 0.3 0.7
Medium-Low Risk 0.4 0.6
Medium Risk 0.5 0.5
Medium-High Risk 0.6 0.4
High Risk 0.7 0.3

7.2 Expected Loss Calculation

Display Code
# Calculate expected loss
portfolio_risk <- scored_data %>%
  left_join(lgd_assumptions, by = "risk_tier") %>%
  mutate(
    pd = .pred_Yes,  # Probability of default
    ead = loan_amount,  # Exposure at default
    expected_loss = pd * lgd_rate * ead
  )

# Portfolio-level metrics
portfolio_metrics <- portfolio_risk %>%
  summarise(
    total_exposure = sum(ead),
    total_expected_loss = sum(expected_loss),
    avg_pd = mean(pd),
    loss_rate = total_expected_loss / total_exposure
  )

# By risk tier
risk_tier_el <- portfolio_risk %>%
  group_by(risk_tier) %>%
  summarise(
    count = n(),
    total_exposure = sum(ead),
    total_expected_loss = sum(expected_loss),
    avg_pd = mean(pd),
    loss_rate = total_expected_loss / total_exposure,
    .groups = "drop"
  )

kable(risk_tier_el,
      digits = c(0, 0, 0, 0, 4, 4),
      col.names = c("Risk Tier", "Count", "Total Exposure", 
                    "Expected Loss", "Avg PD", "Loss Rate"),
      caption = "Expected Loss by Risk Tier") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Expected Loss by Risk Tier
Risk Tier Count Total Exposure Expected Loss Avg PD Loss Rate
High Risk 126 2663233726 1551078030 0.8314 0.5824
Low Risk 719 16881523462 772334034 0.1482 0.0458
Medium Risk 1236 29453951196 7300758568 0.4975 0.2479
Medium-High Risk 1039 24256558268 9990213842 0.6865 0.4119
Medium-Low Risk 1880 42927303889 5100428620 0.2971 0.1188

7.3 Expected Loss Visualization

Display Code
p <- ggplot(risk_tier_el, aes(x = risk_tier, y = total_expected_loss, 
                               fill = risk_tier)) +
  geom_col(alpha = 0.8) +
  geom_text(aes(label = dollar(total_expected_loss, scale = 1e-6, suffix = "M")), 
            vjust = -0.5, size = 4) +
  scale_fill_brewer(palette = "RdYlGn", direction = -1) +
  scale_y_continuous(labels = dollar_format(scale = 1e-6, suffix = "M"),
                    expand = expansion(mult = c(0, 0.1))) +
  labs(
    title = "Fig (7.3): Expected Loss by Risk Tier",
    x = NULL,
    y = "Expected Loss ($M)"
  ) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none"
  ) +
  theme_gray() +
  themeMain 
  
p

7.4 Monte Carlo Simulation

Display Code
## #| fig-cap: "Monte Carlo simulation of portfolio losses"

set.seed(714)
n_simulations <- 50000

# Run simulations
simulate_portfolio_loss <- function(portfolio_data, lgd_data) {
  portfolio_data %>%
    left_join(lgd_data, by = "risk_tier") %>%
    mutate(
      default_sim = rbinom(n(), 1, .pred_Yes),
      loss = default_sim * lgd_rate * loan_amount
    ) %>%
    summarise(total_loss = sum(loss)) %>%
    pull(total_loss)
}

# Generate simulation results
sim_results <- map_dbl(1:n_simulations, ~{
  simulate_portfolio_loss(scored_data, lgd_assumptions)
})

# Calculate VaR and ES
var_95 <- quantile(sim_results, 0.95)
var_99 <- quantile(sim_results, 0.99)
var_999 <- quantile(sim_results, 0.999)
expected_shortfall_95 <- mean(sim_results[sim_results >= var_95])
expected_loss <- mean(sim_results)

# Create simulation summary
sim_summary <- tibble(
  metric = c("Mean Loss", "Std Dev", "95% Value at Risk", "99% Value at Risk", "99.9% Value at Risk", "95% Expected Shortfall"),
  value = c(
    expected_loss,
    sd(sim_results),
    var_95,
    var_99,
    var_999,
    expected_shortfall_95
  )
) %>%
  mutate(value_formatted = dollar(value, scale = 1e-6, suffix = "M"))

kable(sim_summary %>% select(metric, value_formatted),
      col.names = c("Metric", "Value"),
      caption = "Monte Carlo Simulation Results (50,000 iterations)") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Monte Carlo Simulation Results (50,000 iterations)
Metric Value
Mean Loss $24,716.13M
Std Dev $455.39M
95% Value at Risk $25,461.00M
99% Value at Risk $25,771.37M
99.9% Value at Risk $26,155.03M
95% Expected Shortfall $25,653.00M
Display Code
# Plot distribution
sim_df <- tibble(loss = sim_results)

plot_7.4 <- ggplot(sim_df, aes(x = loss)) +
  geom_histogram(bins = 100, fill = "#3498db", alpha = 0.7) +
  geom_vline(xintercept = expected_loss, 
             color = "#2ecc71", linetype = "dashed", size = 1) +
  geom_vline(xintercept = var_95, 
             color = "#f39c12", linetype = "dashed", size = 1) +
  geom_vline(xintercept = var_99, 
             color = "#e74c3c", linetype = "dashed", size = 1) +
  annotate("text", x = expected_loss, y = Inf, 
           label = "Expected Loss", vjust = 2, color = "#2ecc71") +
  annotate("text", x = var_95, y = Inf, 
           label = "95% VaR", vjust = 2, color = "#f39c12") +
  annotate("text", x = var_99, y = Inf, 
           label = "99% VaR", vjust = 2, color = "#e74c3c") +
  scale_x_continuous(labels = dollar_format(scale = 1e-6, suffix = "M")) +
  labs(
    title = "Fig (7.4): Monte Carlo simulation of portfolio losses",
    caption = "50,000 Monte Carlo simulations",
    x = "Total Portfolio Loss",
    y = "Frequency"
  ) +
  theme_gray() +
  themeMain 

plot_7.4


8. Business Recommendations

8.1 Risk Management Strategy

Based on the analysis, we recommend the following risk-based strategies:

Display Code
recommendations <- tribble(
  ~`Risk Tier`, ~`Action`, ~`Rationale`,
  "Low Risk", "Approve with standard terms", "Default rate <5%, strong creditworthiness",
  "Medium-Low Risk", "Approve with monitoring", "Acceptable risk, periodic review recommended",
  "Medium Risk", "Conditional approval", "Enhanced documentation and monitoring required",
  "Medium-High Risk", "Risk-based pricing", "Higher rates to compensate for elevated risk",
  "High Risk", "Decline or secured lending", "Default rate >30%, requires collateral"
)

kable(recommendations,
      caption = "Recommended Actions by Risk Tier") %>%
  kable_styling(bootstrap_options = c("striped", "hover")) %>%
  column_spec(1, bold = TRUE) %>%
  row_spec(5, background = "#fadbd8")
Recommended Actions by Risk Tier
Risk Tier Action Rationale
Low Risk Approve with standard terms Default rate <5%, strong creditworthiness
Medium-Low Risk Approve with monitoring Acceptable risk, periodic review recommended
Medium Risk Conditional approval Enhanced documentation and monitoring required
Medium-High Risk Risk-based pricing Higher rates to compensate for elevated risk
High Risk Decline or secured lending Default rate >30%, requires collateral

8.2 Capital Requirements

Display Code
# Calculate regulatory capital (simplified Basel approach)
capital_requirements <- risk_tier_el %>%
  mutate(
    risk_weight = case_when(
      risk_tier == "Low Risk" ~ 0.20,
      risk_tier == "Medium-Low Risk" ~ 0.50,
      risk_tier == "Medium Risk" ~ 0.75,
      risk_tier == "Medium-High Risk" ~ 1.00,
      risk_tier == "High Risk" ~ 1.50
    ),
    rwa = total_exposure * risk_weight,
    capital_requirement = rwa * 0.08  # 8% capital ratio
  )

total_capital <- sum(capital_requirements$capital_requirement)

kable(capital_requirements %>%
        select(risk_tier, total_exposure, risk_weight, rwa, capital_requirement),
      digits = c(0, 0, 2, 0, 0),
      col.names = c("Risk Tier", "Exposure", "Risk Weight", "RWA", "Capital Required"),
      caption = paste0("Capital Requirements (Total: ", 
                      dollar(total_capital, scale = 1e-6, suffix = "M"), ")")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Capital Requirements (Total: $6,014.55M)
Risk Tier Exposure Risk Weight RWA Capital Required
High Risk 2663233726 1.50 3994850589 319588047
Low Risk 16881523462 0.20 3376304692 270104375
Medium Risk 29453951196 0.75 22090463397 1767237072
Medium-High Risk 24256558268 1.00 24256558268 1940524661
Medium-Low Risk 42927303889 0.50 21463651945 1717092156

8.3 Portfolio Optimization Scenarios

Display Code
###| fig-cap: "Impact of portfolio composition changes on expected loss"

# Create scenarios
scenarios <- tibble(
  scenario = c("Current", "Conservative", "Aggressive", "Balanced"),
  low_pct = c(0.25, 0.40, 0.15, 0.30),
  med_low_pct = c(0.30, 0.30, 0.20, 0.30),
  med_pct = c(0.25, 0.20, 0.25, 0.25),
  med_high_pct = c(0.15, 0.08, 0.25, 0.12),
  high_pct = c(0.05, 0.02, 0.15, 0.03)
) %>%
  pivot_longer(-scenario, names_to = "tier", values_to = "percentage") %>%
  mutate(
    tier = str_remove(tier, "_pct"),
    tier = str_replace_all(tier, "_", "-"),
    tier = str_to_title(tier),
    tier = case_when(
      tier == "Low" ~ "Low Risk",
      tier == "Med-Low" ~ "Medium-Low Risk",
      tier == "Med" ~ "Medium Risk",
      tier == "Med-High" ~ "Medium-High Risk",
      tier == "High" ~ "High Risk"
    )
  ) %>%
  left_join(
    risk_tier_el %>% select(risk_tier, avg_pd, loss_rate),
    by = c("tier" = "risk_tier")
  ) %>%
  group_by(scenario) %>%
  summarise(
    weighted_pd = sum(percentage * avg_pd),
    weighted_loss_rate = sum(percentage * loss_rate),
    .groups = "drop"
  )

plot_8.3 <- ggplot(scenarios, aes(x = reorder(scenario, -weighted_loss_rate), 
                           y = weighted_loss_rate, fill = scenario)) +
  geom_col(alpha = 0.8) +
  geom_text(aes(label = percent(weighted_loss_rate, accuracy = 0.01)), 
            vjust = -0.5, size = 4) +
  scale_fill_brewer(palette = "Set2") +
  scale_y_continuous(labels = percent, expand = expansion(mult = c(0, 0.1))) +
  labs(
    title = "Fig (8.3): Expected Loss Rate by Portfolio Strategy",
    x = "Scenario",
    y = "Portfolio Loss Rate"
  ) +
  theme(legend.position = "none") +
  theme_gray() +
  themeMain 

plot_8.3


9. Conclusions & Next Steps

Key Findings

  1. Model Performance: The Random Forest model achieves 0.304 AUC on the test set, demonstrating strong discriminatory power between default and non-default customers.

  2. Risk Drivers: Key predictors of default include:

  • Debt-to-income ratio
  • Credit utilization
  • Number of delinquencies
  • Payment-to-income ratio
  1. Portfolio Risk: Expected loss across the portfolio is $24,714.81M, representing a 21.27% loss rate.

  2. Capital Impact: Estimated regulatory capital requirement of $6,014.55M under risk-weighted approach.

Recommendations

  • Automated Decisioning: Deploy the scorecard for applications scoring <200 (auto-approve) and >800 (auto-decline)
  • Enhanced Monitoring: Implement quarterly model retraining and performance monitoring
  • Risk-Based Pricing: Adjust interest rates based on risk tiers to optimize risk-adjusted returns
  • Portfolio Rebalancing: Target conservative mix to reduce expected losses by ~15%

Technical Implementation

This scorecard can be operationalized through: - API deployment for real-time scoring - Batch scoring for existing portfolio monitoring - Integration with loan origination systems - Automated reporting dashboards


Appendix: Technical Details

Model Specifications

Logistic Regression (LASSO) - Engine: glmnet - Penalty: 0.0059948 - Features: Normalized, one-hot encoded with interactions

Random Forest - Engine: ranger - Trees: 500 - mtry: 3 - min_n: 20

Session Information

R version 4.5.2 (2025-10-31)
Platform: aarch64-apple-darwin20
Running under: macOS Sequoia 15.3.1

Matrix products: default
BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib 
LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

time zone: America/New_York
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] pdp_0.8.2          ggridges_0.5.7     patchwork_1.3.2    kableExtra_1.4.0  
 [5] knitr_1.50         plotly_4.11.0      vip_0.4.1          themis_1.0.3      
 [9] yardstick_1.3.2    workflowsets_1.1.1 workflows_1.3.0    tune_2.0.1        
[13] tailor_0.1.0       rsample_1.3.1      recipes_1.3.1      parsnip_1.3.3     
[17] modeldata_1.5.1    infer_1.0.9        dials_1.4.2        scales_1.4.0      
[21] broom_1.0.10       tidymodels_1.4.1   lubridate_1.9.4    forcats_1.0.1     
[25] stringr_1.6.0      dplyr_1.1.4        purrr_1.2.0        readr_2.1.5       
[29] tidyr_1.3.1        tibble_3.3.0       ggplot2_4.0.1      tidyverse_2.0.0   

loaded via a namespace (and not attached):
 [1] rlang_1.1.6         magrittr_2.0.4      furrr_0.3.1        
 [4] compiler_4.5.2      mgcv_1.9-3          systemfonts_1.3.1  
 [7] vctrs_0.6.5         lhs_1.2.0           shape_1.4.6.1      
[10] crayon_1.5.3        pkgconfig_2.0.3     fastmap_1.2.0      
[13] backports_1.5.0     labeling_0.4.3      rmarkdown_2.30     
[16] prodlim_2025.04.28  tzdb_0.5.0          bit_4.6.0          
[19] glmnet_4.1-10       xfun_0.54           jsonlite_2.0.0     
[22] parallel_4.5.2      R6_2.6.1            stringi_1.8.7      
[25] RColorBrewer_1.1-3  ranger_0.17.0       parallelly_1.45.1  
[28] rpart_4.1.24        Rcpp_1.1.0          iterators_1.0.14   
[31] future.apply_1.20.0 Matrix_1.7-4        splines_4.5.2      
[34] nnet_7.3-20         timechange_0.3.0    tidyselect_1.2.1   
[37] rstudioapi_0.17.1   yaml_2.3.10         timeDate_4051.111  
[40] codetools_0.2-20    listenv_0.10.0      lattice_0.22-7     
[43] withr_3.0.2         S7_0.2.1            evaluate_1.0.5     
[46] future_1.68.0       survival_3.8-3      xml2_1.4.1         
[49] pillar_1.11.1       foreach_1.5.2       generics_0.1.4     
[52] vroom_1.6.6         hms_1.1.4           globals_0.18.0     
[55] class_7.3-23        glue_1.8.0          ROSE_0.0-4         
[58] lazyeval_0.2.2      tools_4.5.2         data.table_1.17.8  
[61] gower_1.0.2         RANN_2.6.2          grid_4.5.2         
[64] crosstalk_1.2.2     ipred_0.9-15        nlme_3.1-168       
[67] cli_3.6.5           DiceDesign_1.10     textshaping_1.0.4  
[70] viridisLite_0.4.2   svglite_2.2.2       lava_1.8.2         
[73] gtable_0.3.6        GPfit_1.0-9         digest_0.6.39      
[76] htmlwidgets_1.6.4   farver_2.1.2        htmltools_0.5.8.1  
[79] lifecycle_1.0.4     hardhat_1.4.2       httr_1.4.7         
[82] sparsevctrs_0.3.4   bit64_4.6.0-1       MASS_7.3-65