# File Name: loan_default_prediction.R
# Author: Gerard King - www.gerardking.dev
# Title: Loan Default Prediction using Logistic Regression
# Description: This program uses logistic regression to predict the likelihood of a customer defaulting on a loan.
# It utilizes features such as credit score, income, loan amount, and loan term to classify customers into default or non-default categories.
# Use Cases:
# - Predicting loan default risk for customers
# - Identifying high-risk customers for proactive measures
# - Assessing the creditworthiness of applicants based on various factors
# Audience:
# - Data scientists working in banking and financial services
# - Loan officers and credit analysts in financial institutions
# - Students and researchers learning about machine learning in finance
# Blue Team Uses:
# - Predicting and identifying potential fraudulent loan applications
# - Monitoring the likelihood of loan defaults to reduce credit risk
# - Assessing customer risk profiles for business decisions
# Red Team Uses:
# - Simulating loan defaults to identify weaknesses in the risk assessment models
# - Analyzing various attack strategies on banking systems based on loan performance metrics
# - Detecting patterns in customer behavior that may signal fraudulent loan activity
# Current Date: 2025-03-06
# Load necessary library
library(ggplot2)
# Set the seed for reproducibility
set.seed(505)

# Generate a synthetic dataset of loan applicants.
# Features: credit score, income, loan amount and loan term.
n <- 500 # number of applicants
credit_score <- rnorm(n, mean = 650, sd = 50)
income <- rnorm(n, mean = 50000, sd = 10000)
loan_amount <- rnorm(n, mean = 15000, sd = 5000)
loan_term <- sample(c(15, 30), n, replace = TRUE) # loan term in years

# Draw the outcome from a logistic probability model rather than from a hard
# threshold rule. A deterministic rule such as
#   credit_score < 600 & income < 40000 & loan_amount > 20000
# flags only about 0.4% of applicants and makes the classes perfectly
# separable, which leaves the logistic regression with almost no positive
# cases and unstable (diverging) coefficient estimates.
# The direction of each effect follows that rule: lower credit score, lower
# income and a larger loan all raise the default risk. The features are
# centred on their simulation means so the intercept sets the baseline risk.
# The outcome is drawn after the features, so the feature values for this
# seed are unaffected by this step.
default_log_odds <- -2 -
  0.02 * (credit_score - 650) -
  1e-04 * (income - 50000) +
  2e-04 * (loan_amount - 15000)
# 1 means default, 0 means no default
default <- rbinom(n, size = 1, prob = plogis(default_log_odds))

# Create a data frame. Both outcome levels are declared explicitly so the
# factor always has levels "0" and "1", even if one class were absent.
loan_data <- data.frame(
  credit_score, income, loan_amount, loan_term,
  default = factor(default, levels = c(0, 1))
)
# Fit a binomial GLM (logistic regression) of the default indicator on the
# four applicant features held in loan_data.
logit_model <- glm(
  default ~ credit_score + income + loan_amount + loan_term,
  family = binomial,
  data = loan_data
)
# Store the in-sample predicted probabilities (response scale) next to the
# features, then turn them into a 0/1 class with a 0.5 cut-off.
loan_data[["predicted_prob"]] <- predict(logit_model, type = "response")
loan_data[["predicted_class"]] <- ifelse(
  loan_data[["predicted_prob"]] > 0.5, 1, 0
)
# Visualize the predicted class of each applicant by credit score and income.
# The plot object is built first and then printed explicitly so that it is
# also rendered when the script is run through source(), where top-level
# values are not auto-printed.
prediction_plot <- ggplot(
  loan_data,
  aes(x = credit_score, y = income, color = as.factor(predicted_class))
) +
  geom_point(size = 3) +
  labs(
    title = "Loan Default Prediction: Credit Score vs Income",
    x = "Credit Score", y = "Income", color = "Predicted Class"
  ) +
  # Colours are keyed by class label rather than by position, so the mapping
  # stays the same even when only one class is predicted:
  # 0 (no default) is green, 1 (default) is red.
  scale_color_manual(values = c("0" = "green", "1" = "red")) +
  theme_minimal()
print(prediction_plot)
# Display the logistic regression model summary.
# print() is called explicitly: a bare summary() call is only shown through
# top-level auto-printing, which does not happen when the script is run via
# source() with its default settings.
cat("Logistic Regression Model Summary:\n")
print(summary(logit_model))
# Print the current date for reference.
# The date is formatted to text first: cat() ignores the Date class and would
# otherwise print the underlying number of days since 1970-01-01.
execution_date <- format(Sys.Date(), "%Y-%m-%d")
cat("Date of execution:", execution_date, "\n")