# File Name: principal_component_analysis.R
# Author: Gerard King - www.gerardking.dev
# Title: Principal Component Analysis (PCA) and Visualization
# Description: This R program generates a synthetic dataset, applies PCA to reduce the data's
# dimensionality, and visualizes the first two principal components. PCA is widely used for
# feature extraction and data visualization in high-dimensional datasets.
# Use Cases:
# - Reducing the dimensionality of high-dimensional data
# - Visualizing high-dimensional datasets in lower dimensions
# - Identifying patterns or structures in complex data
# Audience:
# - Data scientists and machine learning practitioners
# - Students learning dimensionality reduction techniques
# - Researchers analyzing multivariate datasets
# Blue Team Uses:
# - Reducing system performance metrics (e.g., CPU, memory, network) to simpler features
# - Visualizing high-dimensional logs or system data for easier monitoring and anomaly detection
# - Identifying patterns in complex security data
# Red Team Uses:
# - Reducing attack simulation data to identify patterns and critical features
# - Using PCA to visualize attack data in lower dimensions for easier analysis
# - Identifying the most critical variables in simulated attacks to help in prioritizing security measures
# Current Date: 2025-03-06
# Load necessary library
library(ggplot2)
# Seed the RNG so every run draws the same synthetic sample
set.seed(303)
# Build a 100-row data frame of five independent standard-normal features.
# Columns are drawn one after another (var1 first, var5 last), so the random
# stream is consumed in the same order as five sequential rnorm(100) calls.
data <- local({
  feature_names <- paste0("var", 1:5)
  draws <- lapply(feature_names, function(feature) rnorm(100))
  names(draws) <- feature_names
  as.data.frame(draws)
})
# Fit the PCA on standardised features: prcomp is asked to mean-centre each
# column and scale it to unit variance before extracting the components
pca_result <- prcomp(x = data, scale. = TRUE, center = TRUE)
# Print the per-component summary table (standard deviation, proportion of
# variance, cumulative proportion) under a short heading
cat("Summary of PCA:\n")
print(summary(object = pca_result))
# Keep only the scores on the first two principal components. Selecting the
# columns by name with drop = FALSE keeps the result a two-column matrix
# before it is converted to a data frame
pca_data <- as.data.frame(pca_result$x[, c("PC1", "PC2"), drop = FALSE])
# Build the scatter plot of PC1 against PC2
pca_plot <- ggplot(pca_data, aes(x = PC1, y = PC2)) +
  geom_point(color = "blue", size = 3) +
  labs(
    title = "PCA: First Two Principal Components",
    x = "Principal Component 1",
    y = "Principal Component 2"
  ) +
  theme_minimal()
# Print explicitly: a ggplot object is only drawn when it is printed, and
# top-level auto-printing does not happen when the script is run via source()
print(pca_plot)
# Report the share of total variance carried by PC1 and PC2, read from the
# "Proportion of Variance" row of the PCA summary's importance matrix
cat("\nProportion of Variance Explained by the First Two Principal Components:\n")
print(summary(pca_result)$importance["Proportion of Variance", c("PC1", "PC2")])
# Print the current date for reference.
# cat() ignores the Date class and would emit the underlying day count
# (days since 1970-01-01), so convert the date to an ISO 8601 string first
execution_date <- format(Sys.Date(), "%Y-%m-%d")
cat("Date of execution:", execution_date, "\n")