Introduction

The STPGA (Subset Training Population Genetic Algorithm) package provides advanced tools for optimizing training set selection in genomic prediction and experimental design. This vignette demonstrates the package’s key functionalities using real wheat genomic data.

Key Features

Modern optimization criteria: A-, D-, E-optimality, PEV, Cook’s Distance
Mixed model support: BLUP-based criteria for genomic selection
Genetic algorithms: Single and multi-objective optimization
Flexible interface: Unified criterion function with legacy compatibility
Robust numerics: Ridge regularization and numerical stability

Getting Started

Let’s start by loading the package and exploring the included wheat dataset:

library(STPGA)

# Load the wheat dataset
data("WheatData")

# Explore the data structure
cat("Dataset components:\n")
#> Dataset components:
cat("- Wheat.Y: Phenotype data (", nrow(Wheat.Y), " observations)\n")
#> - Wheat.Y: Phenotype data ( 200  observations)
cat("- Wheat.M: Marker data (", nrow(Wheat.M), " individuals x ", ncol(Wheat.M), " markers)\n")
#> - Wheat.M: Marker data ( 200  individuals x  4670  markers)
cat("- Wheat.K: Kinship matrix (", nrow(Wheat.K), " x ", ncol(Wheat.K), ")\n")
#> - Wheat.K: Kinship matrix ( 200  x  200 )

# Look at the phenotype data
head(Wheat.Y)
#>               id plant.height
#> 1846 10542-63/87        91.44
#> 250         131A       124.46
#> 1541        1548       129.54
#> 1516      155/71       124.46
#> 508         1664       134.62
#> 1549        1666       144.78

# Check marker data dimensions and first few markers
cat("Marker matrix preview:\n")
#> Marker matrix preview:
Wheat.M[1:5, 1:5]
#>            IWA1 IWA2 IWA3 IWA4 IWA5
#> IWA8610266    1    1   -1   -1   -1
#> IWA8606816    1    1    1   -1   -1
#> 3883          1    1    1   -1   -1
#> NW86A         1    1    1    1   -1
#> PI345476      1    1    1   -1   -1

# Examine kinship matrix structure
cat("Kinship matrix diagonal (first 10):\n")
#> Kinship matrix diagonal (first 10):
diag(Wheat.K)[1:10]
#> IWA8610266 IWA8606816       3883      NW86A   PI345476    H86-708 IWA8602323 
#>   1.957836   2.003338   1.980769   2.281661   1.808700   1.047744   1.970056 
#>  237-VII/2       1666 IWA8610164 
#>   1.760409   2.084154   1.983354
cat("Kinship matrix off-diagonal (first 5x5):\n")
#> Kinship matrix off-diagonal (first 5x5):
Wheat.K[1:5, 1:5]
#>            IWA8610266 IWA8606816       3883      NW86A   PI345476
#> IWA8610266  1.9578361  0.8261588  0.7546713  0.5149389 -0.2697654
#> IWA8606816  0.8261588  2.0033376  0.5756170  0.5241991 -0.2315324
#> 3883        0.7546713  0.5756170  1.9807687  0.7077443 -0.2888690
#> NW86A       0.5149389  0.5241991  0.7077443  2.2816612 -0.2645939
#> PI345476   -0.2697654 -0.2315324 -0.2888690 -0.2645939  1.8086996

Optimization Criteria

STPGA provides several optimization criteria for training set selection. Let’s demonstrate each using the wheat data.

Setup for Demonstrations

# Define candidate pool and test sets
all_individuals <- rownames(Wheat.M)
n_total <- length(all_individuals)

# Create training candidates and test set
set.seed(123)
test_set <- sample(all_individuals, 40)
candidates <- setdiff(all_individuals, test_set)

cat("Total individuals:", n_total, "\n")
#> Total individuals: 200
cat("Test set size:", length(test_set), "\n")
#> Test set size: 40
cat("Candidate pool size:", length(candidates), "\n")
#> Candidate pool size: 160

# Extract first 5 principal components for non-mixed model criteria
# This reduces computational complexity for high-dimensional marker data
cat("\nExtracting first 5 principal components for non-mixed model criteria...\n")
#> 
#> Extracting first 5 principal components for non-mixed model criteria...
pca_result <- prcomp(Wheat.M, center = TRUE, scale. = TRUE)
Wheat.PC5 <- pca_result$x[, 1:5]
rownames(Wheat.PC5) <- rownames(Wheat.M)

cat("PC matrix dimensions:", nrow(Wheat.PC5), "x", ncol(Wheat.PC5), "\n")
#> PC matrix dimensions: 200 x 5
cat("Variance explained by first 5 PCs:", sprintf("%.2f%%", 100 * sum(pca_result$sdev[1:5]^2) / sum(pca_result$sdev^2)), "\n")
#> Variance explained by first 5 PCs: 30.65%

Classical Optimality Criteria

The package implements the classical A-, D-, and E-optimality criteria from experimental design theory.

# Select training sets of different sizes
train_small <- sample(candidates, 30)
train_medium <- sample(candidates, 60)
train_large <- sample(candidates, 100)

# Compute A-optimality using first 5 PCs (minimizes average prediction variance)
aopt_small <- a_optimality(train_small, test_set, Wheat.PC5)
aopt_medium <- a_optimality(train_medium, test_set, Wheat.PC5)
aopt_large <- a_optimality(train_large, test_set, Wheat.PC5)

cat("A-optimality results:\n")
#> A-optimality results:
cat("Small training (n=30):", sprintf("%.2e", aopt_small), "\n")
#> Small training (n=30): 1.37e-03
cat("Medium training (n=60):", sprintf("%.2e", aopt_medium), "\n")
#> Medium training (n=60): 5.26e-04
cat("Large training (n=100):", sprintf("%.2e", aopt_large), "\n")
#> Large training (n=100): 3.01e-04
cat("Improvement (large vs small):", sprintf("%.1fx", aopt_small/aopt_large), "\n")
#> Improvement (large vs small): 4.5x

# D-optimality using first 5 PCs (minimizes generalized variance)
dopt_small <- d_optimality(train_small, test_set, Wheat.PC5)
dopt_medium <- d_optimality(train_medium, test_set, Wheat.PC5)
dopt_large <- d_optimality(train_large, test_set, Wheat.PC5)

cat("\nD-optimality results:\n")
#> 
#> D-optimality results:
cat("Small training (n=30):", sprintf("%.2f", dopt_small), "\n")
#> Small training (n=30): 42.80
cat("Medium training (n=60):", sprintf("%.2f", dopt_medium), "\n")
#> Medium training (n=60): 47.03
cat("Large training (n=100):", sprintf("%.2f", dopt_large), "\n")
#> Large training (n=100): 49.77

# E-optimality using first 5 PCs (minimizes maximum variance)
eopt_small <- e_optimality(train_small, test_set, Wheat.PC5)
eopt_medium <- e_optimality(train_medium, test_set, Wheat.PC5)
eopt_large <- e_optimality(train_large, test_set, Wheat.PC5)

cat("\nE-optimality results:\n")
#> 
#> E-optimality results:
cat("Small training (n=30):", sprintf("%.2f", eopt_small), "\n")
#> Small training (n=30): 10.06
cat("Medium training (n=60):", sprintf("%.2f", eopt_medium), "\n")
#> Medium training (n=60): 10.69
cat("Large training (n=100):", sprintf("%.2f", eopt_large), "\n")
#> Large training (n=100): 11.35

Prediction Error Variance (PEV) Criteria

PEV criteria are particularly relevant for prediction accuracy in genomic selection.

# Mean PEV using first 5 PCs
pev_mean_small <- pev_mean(train_small, test_set, Wheat.PC5)
pev_mean_large <- pev_mean(train_large, test_set, Wheat.PC5)

# Normalized PEV using first 5 PCs (scale-invariant)
pev_norm_small <- pev_mean(train_small, test_set, Wheat.PC5, normalized = TRUE)
pev_norm_large <- pev_mean(train_large, test_set, Wheat.PC5, normalized = TRUE)

# Maximum PEV using first 5 PCs (worst-case scenario)
pev_max_small <- pev_max(train_small, test_set, Wheat.PC5)
pev_max_large <- pev_max(train_large, test_set, Wheat.PC5)

cat("PEV Results:\n")
#> PEV Results:
cat("Mean PEV (small):", sprintf("%.2e", pev_mean_small), "\n")
#> Mean PEV (small): 1.24e+00
cat("Mean PEV (large):", sprintf("%.2e", pev_mean_large), "\n")
#> Mean PEV (large): 1.05e+00
cat("Improvement:", sprintf("%.1fx", pev_mean_small/pev_mean_large), "\n")
#> Improvement: 1.2x

cat("\nNormalized PEV:\n")
#> 
#> Normalized PEV:
cat("Small training:", sprintf("%.6f", pev_norm_small), "\n")
#> Small training: 0.025000
cat("Large training:", sprintf("%.6f", pev_norm_large), "\n")
#> Large training: 0.025000

cat("\nMax PEV:\n")
#> 
#> Max PEV:
cat("Small training:", sprintf("%.2e", pev_max_small), "\n")
#> Small training: 1.76e+00
cat("Large training:", sprintf("%.2e", pev_max_large), "\n")
#> Large training: 1.20e+00
cat("Max/Mean ratio (small):", sprintf("%.2f", pev_max_small/pev_mean_small), "\n")
#> Max/Mean ratio (small): 1.42
cat("Max/Mean ratio (large):", sprintf("%.2f", pev_max_large/pev_mean_large), "\n")
#> Max/Mean ratio (large): 1.14

Coefficient of Determination (R²) Criteria

The Coefficient of Determination measures the proportion of variance explained by the model.

# Mean Coefficient of Determination (R²) using first 5 PCs
cd_mean_small <- cd_mean(train_small, test_set, Wheat.PC5)
cd_mean_large <- cd_mean(train_large, test_set, Wheat.PC5)

# Normalized Coefficient of Determination using first 5 PCs
cd_norm_small <- cd_mean(train_small, test_set, Wheat.PC5, normalized = TRUE)
cd_norm_large <- cd_mean(train_large, test_set, Wheat.PC5, normalized = TRUE)

cat("Coefficient of Determination Results:\n")
#> Coefficient of Determination Results:
cat("Mean CD (small):", sprintf("%.2e", cd_mean_small), "\n")
#> Mean CD (small): 2.42e-01
cat("Mean CD (large):", sprintf("%.2e", cd_mean_large), "\n")
#> Mean CD (large): 5.26e-02

cat("\nNormalized R²:\n")
#> 
#> Normalized R²:
cat("Small training:", sprintf("%.6f", cd_norm_small), "\n")
#> Small training: 0.025000
cat("Large training:", sprintf("%.6f", cd_norm_large), "\n")
#> Large training: 0.025000

Mixed Model Criteria for Genomic Selection

For genomic selection applications, the package provides mixed model criteria that account for genetic relationships.

# Select smaller sets for computational efficiency with mixed models
train_mm <- sample(candidates, 50)
test_mm <- sample(test_set, 20)

# Extract relevant kinship submatrices
K_train <- Wheat.K[train_mm, train_mm]
K_test <- Wheat.K[test_mm, test_mm]
K_test_train <- Wheat.K[test_mm, train_mm]

# Mixed model PEV with different heritability scenarios
h2_high <- list(Vg = 0.8, Ve = 0.2)  # High heritability (80%)
h2_low <- list(Vg = 0.3, Ve = 0.7)   # Low heritability (30%)

# Use smaller dataset for computational efficiency
# For mixed models, use subset of individuals but all relationships
all_mm_individuals <- c(train_mm, test_mm)
P_subset <- Wheat.M[all_mm_individuals, 1:100]  # Use only first 100 markers
K_subset <- Wheat.K[all_mm_individuals, all_mm_individuals]

# Compute mixed model criteria with smaller dataset
pev_mm_high <- pev_mean_mm(train_mm, test_mm, P_subset, K_subset, 
                           Vg = h2_high$Vg, Ve = h2_high$Ve)
#> Warning in validate_kinship_matrix(K, rownames(P)): Kinship matrix diagonal
#> values outside expected range [0.5, 2]
#> Warning in pev_mean_mm(train_mm, test_mm, P_subset, K_subset, Vg = h2_high$Vg,
#> : Negative PEV values detected. This may indicate numerical issues.
pev_mm_low <- pev_mean_mm(train_mm, test_mm, P_subset, K_subset, 
                          Vg = h2_low$Vg, Ve = h2_low$Ve)
#> Warning in validate_kinship_matrix(K, rownames(P)): Kinship matrix diagonal
#> values outside expected range [0.5, 2]
#> Warning in pev_mean_mm(train_mm, test_mm, P_subset, K_subset, Vg = h2_low$Vg, :
#> Negative PEV values detected. This may indicate numerical issues.

cd_mm_high <- cd_mean_mm(train_mm, test_mm, P_subset, K_subset, 
                         Vg = h2_high$Vg, Ve = h2_high$Ve)
#> Warning in validate_kinship_matrix(K, rownames(P)): Kinship matrix diagonal
#> values outside expected range [0.5, 2]
#> Warning in pev_mean_mm(train, test, P, K, lambda, C, Vg, Ve): Negative PEV
#> values detected. This may indicate numerical issues.
cd_mm_low <- cd_mean_mm(train_mm, test_mm, P_subset, K_subset, 
                        Vg = h2_low$Vg, Ve = h2_low$Ve)
#> Warning in validate_kinship_matrix(K, rownames(P)): Kinship matrix diagonal
#> values outside expected range [0.5, 2]
#> Warning in validate_kinship_matrix(K, rownames(P)): Negative PEV values
#> detected. This may indicate numerical issues.

cat("Mixed Model Results:\n")
#> Mixed Model Results:
cat("PEV (h² = 0.8):", sprintf("%.6f", pev_mm_high), "\n")
#> PEV (h² = 0.8): 0.429349
cat("PEV (h² = 0.3):", sprintf("%.6f", pev_mm_low), "\n")
#> PEV (h² = 0.3): 0.044506
cat("Heritability effect:", sprintf("%.2fx", pev_mm_low/pev_mm_high), "\n")
#> Heritability effect: 0.10x

cat("\nCoefficient of Determination (mixed model):\n")
#> 
#> Coefficient of Determination (mixed model):
cat("CD (h² = 0.8):", sprintf("%.6f", cd_mm_high), "\n")
#> CD (h² = 0.8): 0.463314
cat("CD (h² = 0.3):", sprintf("%.6f", cd_mm_low), "\n")
#> CD (h² = 0.3): 0.851648

Unified Criterion Interface

The criterion() function provides a unified interface to all optimization criteria, supporting both modern and legacy names.

# Define a medium-sized training set
train_unified <- sample(candidates, 60)

# Test all modern criteria
modern_criteria <- c("a_optimality", "d_optimality", "e_optimality", 
                     "pev_mean", "pev_max", "cd_mean")

cat("Modern Criteria Results:\n")
#> Modern Criteria Results:
cat("========================\n")
#> ========================
for (crit in modern_criteria) {
  value <- criterion(train_unified, test_set, Wheat.PC5, criterion = crit)
  cat(sprintf("%-15s: %12.6e\n", crit, value))
}
#> a_optimality   : 4.961210e-04
#> d_optimality   : 4.721829e+01
#> e_optimality   : 1.078261e+01
#> pev_mean       : 1.090311e+00
#> pev_max        : 1.290717e+00
#> cd_mean        : 9.031103e-02

# Test normalized versions
cat("\nNormalized Criteria:\n")
#> 
#> Normalized Criteria:
cat("===================\n")
#> ===================
pev_norm <- criterion(train_unified, test_set, Wheat.PC5, criterion = "pev_mean_normalized")
cd_norm <- criterion(train_unified, test_set, Wheat.PC5, criterion = "cd_mean_normalized")
cat(sprintf("%-20s: %12.6f\n", "pev_mean_normalized", pev_norm))
#> pev_mean_normalized :     0.025000
cat(sprintf("%-20s: %12.6f\n", "cd_mean_normalized", cd_norm))
#> cd_mean_normalized  :     0.025000

# Test mixed model criteria (using subset for speed)
cat("\nMixed Model Criteria (100 markers):\n")
#> 
#> Mixed Model Criteria (100 markers):
cat("===================================\n")
#> ===================================
P_unified_subset <- Wheat.M[c(train_unified, test_set), 1:100]
K_unified_subset <- Wheat.K[c(train_unified, test_set), c(train_unified, test_set)]
pev_mm <- criterion(train_unified, test_set, P_unified_subset, criterion = "pev_mean_mm", K = K_unified_subset)
#> Warning in validate_kinship_matrix(K, rownames(P)): Kinship matrix diagonal
#> values outside expected range [0.5, 2]
#> Warning in pev_mean_mm(train, test, P, K, lambda, C, Vg, Ve): Negative PEV
#> values detected. This may indicate numerical issues.
cd_mm <- criterion(train_unified, test_set, P_unified_subset, criterion = "cd_mean_mm", K = K_unified_subset)
#> Warning in validate_kinship_matrix(K, rownames(P)): Kinship matrix diagonal
#> values outside expected range [0.5, 2]
#> Warning in validate_kinship_matrix(K, rownames(P)): Negative PEV values
#> detected. This may indicate numerical issues.
cat(sprintf("%-15s: %12.6e\n", "pev_mean_mm", pev_mm))
#> pev_mean_mm    : 7.861777e-01
cat(sprintf("%-15s: %12.6e\n", "cd_mean_mm", cd_mm))
#> cd_mean_mm     : 2.138223e-01

# Demonstrate legacy compatibility
cat("\nLegacy Compatibility:\n")
#> 
#> Legacy Compatibility:
cat("====================\n")
#> ====================
aopt_modern <- criterion(train_unified, test_set, Wheat.PC5, criterion = "a_optimality")
aopt_legacy <- criterion(train_unified, test_set, Wheat.PC5, criterion = "AOPT")
cat("Modern a_optimality:", sprintf("%.6e", aopt_modern), "\n")
#> Modern a_optimality: 4.961210e-04
cat("Legacy AOPT:        ", sprintf("%.6e", aopt_legacy), "\n")
#> Legacy AOPT:         4.961210e-04
cat("Values match:", aopt_modern == aopt_legacy, "\n")
#> Values match: TRUE

Genetic Algorithm Optimization

STPGA’s core functionality is optimizing training set selection using genetic algorithms with advanced convergence detection and selection mechanisms.

Enhanced Genetic Algorithm Features (v7.0.0)

The latest version includes significant improvements to the genetic algorithm implementation:

Multi-criteria convergence detection: Monitors fitness stabilization, improvement rate, population diversity, and fitness variance
Restart mechanisms: Automatically restarts with diversified populations when premature convergence is detected
Advanced selection methods: Tournament, elite, rank-based, and hybrid selection with validation
Configurable convergence windows: Flexible convergence tracking with diagnostic tools

Single-Objective Optimization with Advanced Features

# Define optimization parameters
n_select <- 40  # Number of individuals to select
n_candidates <- length(candidates)

cat("Enhanced Genetic Algorithm Optimization\n")
#> Enhanced Genetic Algorithm Optimization
cat("======================================\n")
#> ======================================
cat("Candidate pool size:", n_candidates, "\n")
#> Candidate pool size: 160
cat("Selection target:", n_select, "\n")
#> Selection target: 40
cat("Test set size:", length(test_set), "\n")
#> Test set size: 40

# Demonstrate the genetic algorithm with advanced features
set.seed(456)

# Basic genetic algorithm run
cat("\nRunning genetic algorithm with default settings...\n")
#> 
#> Running genetic algorithm with default settings...
ga_basic <- subset_ga(
  P = Wheat.PC5,
  Candidates = candidates,
  Test = test_set,
  ntoselect = n_select,
  npop = 30,
  niterations = 50,
  criterion = "pev_mean",
  verbose = FALSE
)

cat("Basic GA Results:\n")
#> Basic GA Results:
cat("Best fitness:", sprintf("%.8f", ga_basic$best_fitness), "\n")
#> Best fitness: 1.09824362
cat("Convergence generation:", ga_basic$convergence_generation, "\n")
#> Convergence generation: 50
cat("Selection method:", ga_basic$parameters$selection_method, "\n")
#> Selection method: tournament

# Enhanced genetic algorithm with rank-based selection and restart
cat("\nRunning enhanced GA with rank-based selection and restart...\n")
#> 
#> Running enhanced GA with rank-based selection and restart...
ga_enhanced <- subset_ga(
  P = Wheat.PC5,
  Candidates = candidates,
  Test = test_set,
  ntoselect = n_select,
  npop = 30,
  niterations = 50,
  criterion = "pev_mean",
  selection_method = "rank",
  selection_pressure = 1.4,
  enable_restart = TRUE,
  restart_threshold = 0.6,
  max_restarts = 1,
  convergence_window_multiplier = 3,
  verbose = FALSE
)

cat("Enhanced GA Results:\n")
#> Enhanced GA Results:
cat("Best fitness:", sprintf("%.8f", ga_enhanced$best_fitness), "\n")
#> Best fitness: 1.10192184
cat("Convergence generation:", ga_enhanced$convergence_generation, "\n")
#> Convergence generation: 50
cat("Total generations:", ga_enhanced$total_generations, "\n")
#> Total generations: 50
cat("Restart count:", ga_enhanced$restart_count, "\n")
#> Restart count: 0
cat("Selection method:", ga_enhanced$parameters$selection_method, "\n")
#> Selection method: rank
cat("Selection pressure:", ga_enhanced$parameters$selection_pressure, "\n")
#> Selection pressure: 1.4

# Compare with random selection
random_selection <- sample(candidates, n_select)
random_fitness <- pev_mean(random_selection, test_set, Wheat.PC5, normalized = TRUE)

cat("\nComparison Results:\n")
#> 
#> Comparison Results:
cat("Random selection PEV:", sprintf("%.8f", random_fitness), "\n")
#> Random selection PEV: 0.02500000
cat("Basic GA PEV:       ", sprintf("%.8f", ga_basic$best_fitness), "\n")
#> Basic GA PEV:        1.09824362
cat("Enhanced GA PEV:    ", sprintf("%.8f", ga_enhanced$best_fitness), "\n")
#> Enhanced GA PEV:     1.10192184

basic_improvement <- (random_fitness - ga_basic$best_fitness) / random_fitness * 100
enhanced_improvement <- (random_fitness - ga_enhanced$best_fitness) / random_fitness * 100

cat("Basic GA improvement:", sprintf("%.2f%%", basic_improvement), "\n")
#> Basic GA improvement: -4292.97%
cat("Enhanced GA improvement:", sprintf("%.2f%%", enhanced_improvement), "\n")
#> Enhanced GA improvement: -4307.69%

Convergence Diagnostics

The enhanced genetic algorithm provides comprehensive convergence diagnostics to help users understand optimization behavior.

# Generate convergence diagnostics for the enhanced GA
cat("Convergence Diagnostics for Enhanced GA:\n")
#> Convergence Diagnostics for Enhanced GA:
cat("=======================================\n")
#> =======================================

diags <- convergence_diagnostics(ga_enhanced, plot = FALSE)

cat("Convergence Status:\n")
#> Convergence Status:
cat("- Converged:", diags$convergence_status$converged, "\n")
#> - Converged: FALSE
cat("- Reason:", diags$convergence_status$reason, "\n")
#> - Reason: convergence criteria not met
cat("- Window size:", diags$window_size, "\n")
#> - Window size: 15
cat("- Data points:", diags$data_points, "\n")
#> - Data points: 15

cat("\nConvergence Metrics:\n")
#> 
#> Convergence Metrics:
cat("- Fitness stability:", sprintf("%.6f", diags$metrics$fitness_stability), "\n")
#> - Fitness stability: 0.004162
cat("- Diversity stability:", sprintf("%.6f", diags$metrics$diversity_stability), "\n")
#> - Diversity stability: 0.001903
cat("- Mean improvement rate:", sprintf("%.6f", diags$metrics$mean_improvement_rate), "\n")
#> - Mean improvement rate: 0.004157
cat("- Variance stability:", sprintf("%.6f", diags$metrics$variance_stability), "\n")
#> - Variance stability: 0.007142

if (length(diags$recommendations) > 0) {
  cat("\nRecommendations:\n")
  for (i in seq_along(diags$recommendations)) {
    cat(paste("-", diags$recommendations[i], "\n"))
  }
} else {
  cat("\nNo specific recommendations - optimization appears satisfactory.\n")
}
#> 
#> No specific recommendations - optimization appears satisfactory.

# Show restart history if any restarts occurred
if (ga_enhanced$restart_count > 0) {
  cat("\nRestart History:\n")
  for (i in 1:ga_enhanced$restart_count) {
    restart_info <- ga_enhanced$restart_history[[i]]
    cat(sprintf("Restart %d: Generation %d, Fitness %.6f, Reason: %s\n",
                i, restart_info$generation, restart_info$best_fitness,
                restart_info$convergence_reason))
  }
}

# Plot GA convergence comparison
par(mfrow = c(1, 2))

# Plot fitness history for both algorithms
plot(1:nrow(ga_basic$fitness_history), ga_basic$fitness_history[, "best"],
     type = "l", col = "blue", lwd = 2,
     xlab = "Generation", ylab = "Best Fitness",
     main = "GA Convergence Comparison")
lines(1:nrow(ga_enhanced$fitness_history), ga_enhanced$fitness_history[, "best"],
      col = "red", lwd = 2)
grid(TRUE)
legend("topright", legend = c("Basic GA", "Enhanced GA"), 
       col = c("blue", "red"), lwd = 2)

# Plot population diversity if available
if ("diversity" %in% names(ga_enhanced$population_stats[[1]])) {
  basic_diversity <- sapply(ga_basic$population_stats, function(x) x$diversity)
  enhanced_diversity <- sapply(ga_enhanced$population_stats, function(x) x$diversity)
  
  plot(1:length(basic_diversity), basic_diversity,
       type = "l", col = "blue", lwd = 2,
       xlab = "Generation", ylab = "Population Diversity",
       main = "Population Diversity Over Time")
  lines(1:length(enhanced_diversity), enhanced_diversity, col = "red", lwd = 2)
  grid(TRUE)
  legend("topright", legend = c("Basic GA", "Enhanced GA"), 
         col = c("blue", "red"), lwd = 2)
}


par(mfrow = c(1, 1))

Multi-Objective Optimization

STPGA supports multi-objective optimization to find Pareto-optimal solutions.

# Multi-objective demonstration: evaluate trade-offs between criteria
cat("Multi-Objective Analysis\n")
#> Multi-Objective Analysis
cat("========================\n")
#> ========================

# Generate several solutions and evaluate on multiple criteria
set.seed(789)
n_solutions <- 8
solutions <- list()
pev_values <- numeric(n_solutions)
cd_values <- numeric(n_solutions)

for (i in 1:n_solutions) {
  solutions[[i]] <- sample(candidates, n_select)
  pev_values[i] <- pev_mean(solutions[[i]], test_set, Wheat.PC5, normalized = TRUE)
  cd_values[i] <- cd_mean(solutions[[i]], test_set, Wheat.PC5, normalized = TRUE)
}

cat("Multi-criteria evaluation:\n")
#> Multi-criteria evaluation:
cat("Solution | PEV Mean | CD Mean\n")
#> Solution | PEV Mean | CD Mean
cat("---------|----------|--------\n")
#> ---------|----------|--------
for (i in 1:n_solutions) {
  cat(sprintf("%8d | %8.6f | %7.6f\n", i, pev_values[i], cd_values[i]))
}
#>        1 | 0.025000 | 0.025000
#>        2 | 0.025000 | 0.025000
#>        3 | 0.025000 | 0.025000
#>        4 | 0.025000 | 0.025000
#>        5 | 0.025000 | 0.025000
#>        6 | 0.025000 | 0.025000
#>        7 | 0.025000 | 0.025000
#>        8 | 0.025000 | 0.025000

# Find solutions that are good for both criteria
combined_rank <- rank(pev_values) + rank(cd_values)
best_combined <- which.min(combined_rank)

cat("\nBest combined solution (lowest sum of ranks):", best_combined, "\n")
#> 
#> Best combined solution (lowest sum of ranks): 5
cat("PEV:", sprintf("%.6f", pev_values[best_combined]), "\n")
#> PEV: 0.025000
cat("CD: ", sprintf("%.6f", cd_values[best_combined]), "\n")
#> CD:  0.025000

# Plot the trade-off between criteria
if (length(pev_values) > 1 && length(cd_values) > 1) {
  plot(pev_values, cd_values,
       pch = 19, col = "blue", cex = 1.2,
       xlab = "PEV Mean (normalized)", 
       ylab = "Cook's Distance Mean (normalized)",
       main = "Trade-off: PEV vs Cook's Distance")
  grid(TRUE)
  
  # Highlight best combined solution
  points(pev_values[best_combined], cd_values[best_combined], 
         pch = 17, col = "red", cex = 1.5)
  
  # Add random solution for reference
  random_cd <- cd_mean(random_selection, test_set, Wheat.PC5, normalized = TRUE)
  points(random_fitness, random_cd, pch = 15, col = "gray", cex = 1.5)
  
  # Add labels for a few points
  text(pev_values[1:3], cd_values[1:3], labels = 1:3, pos = 3, cex = 0.8)
  
  legend("topright", 
         legend = c("Solutions", "Best Combined", "Random"),
         pch = c(19, 17, 15),
         col = c("blue", "red", "gray"))
}

Advanced Features

Ridge Regularization Effects

The package uses ridge regularization for numerical stability. Let’s examine its effects:

# Test different ridge parameters
lambda_values <- c(1e-8, 1e-6, 1e-4, 1e-2)
train_ridge <- sample(candidates, 50)

cat("Ridge Regularization Effects:\n")
#> Ridge Regularization Effects:
cat("=============================\n")
#> =============================
cat("Lambda\t\tA-optimality\tPEV Mean\n")
#> Lambda       A-optimality    PEV Mean
cat("------\t\t------------\t--------\n")
#> ------       ------------    --------

for (lambda in lambda_values) {
  aopt_ridge <- a_optimality(train_ridge, test_set, Wheat.PC5, lambda = lambda)
  pev_ridge <- pev_mean(train_ridge, test_set, Wheat.PC5, lambda = lambda)
  cat(sprintf("%.0e\t\t%.6e\t%.6e\n", lambda, aopt_ridge, pev_ridge))
}
#> 1e-08        7.821194e-04    1.126249e+00
#> 1e-06        7.821194e-04    1.126249e+00
#> 1e-04        7.821193e-04    1.126249e+00
#> 1e-02        7.821172e-04    1.126249e+00

Matrix Stability and Conditioning

# Demonstrate numerical stability features
train_stability <- sample(candidates, 80)

# Check matrix conditioning using first 5 PCs
P_train <- Wheat.PC5[train_stability, ]
xtx <- crossprod(P_train)
condition_number <- kappa(xtx)

cat("Matrix Conditioning:\n")
#> Matrix Conditioning:
cat("===================\n")
#> ===================
cat("Training set size:", length(train_stability), "\n")
#> Training set size: 80
cat("Number of principal components:", ncol(Wheat.PC5), "\n")
#> Number of principal components: 5
cat("X'X condition number:", sprintf("%.2e", condition_number), "\n")
#> X'X condition number: 7.67e+00

if (condition_number > 1e12) {
  cat("Matrix is ill-conditioned - ridge regularization recommended\n")
} else {
  cat("Matrix is well-conditioned\n")
}
#> Matrix is well-conditioned

# Test with extreme case (very small training set)
train_extreme <- sample(candidates, 10)  # Very small training set
P_extreme <- Wheat.PC5[train_extreme, ]  # Use all 5 PCs
xtx_extreme <- crossprod(P_extreme)
condition_extreme <- kappa(xtx_extreme)

cat("\nExtreme case (n << p):\n")
#> 
#> Extreme case (n << p):
cat("Training set size:", nrow(P_extreme), "\n")
#> Training set size: 10
cat("Number of predictors:", ncol(P_extreme), "\n")
#> Number of predictors: 5
cat("Condition number:", sprintf("%.2e", condition_extreme), "\n")
#> Condition number: 9.53e+01

Practical Guidelines

Training Set Size Recommendations

# Analyze the relationship between training set size and prediction quality
sizes <- seq(20, 120, by = 20)
pev_by_size <- numeric(length(sizes))

cat("Training Set Size Analysis:\n")
#> Training Set Size Analysis:
cat("===========================\n")
#> ===========================

set.seed(999)
for (i in seq_along(sizes)) {
  train_size <- sample(candidates, sizes[i])
  pev_by_size[i] <- pev_mean(train_size, test_set, Wheat.PC5, normalized = TRUE)
}

# Create recommendations table
cat("Size\tNormalized PEV\tRelative to n=20\n")
#> Size Normalized PEV  Relative to n=20
cat("----\t--------------\t----------------\n")
#> ---- --------------  ----------------
for (i in seq_along(sizes)) {
  relative <- pev_by_size[1] / pev_by_size[i]
  cat(sprintf("%4d\t%12.6f\t%12.2fx\n", sizes[i], pev_by_size[i], relative))
}
#>   20     0.025000            1.00x
#>   40     0.025000            1.00x
#>   60     0.025000            1.00x
#>   80     0.025000            1.00x
#>  100     0.025000            1.00x
#>  120     0.025000            1.00x

# Plot the relationship
plot(sizes, pev_by_size, type = "b", pch = 19, col = "darkblue", lwd = 2,
     xlab = "Training Set Size", ylab = "Normalized PEV",
     main = "Effect of Training Set Size on Prediction Quality")
grid(TRUE)

# Add trend line
lm_fit <- lm(pev_by_size ~ I(1/sizes))
lines(sizes, predict(lm_fit), col = "red", lty = 2, lwd = 2)

# Add recommendations
abline(v = 60, col = "green", lty = 3)
text(65, max(pev_by_size) * 0.9, "Recommended\nMinimum", col = "green", pos = 4)

Criterion Selection Guide

# Compare all criteria on the same training set for guidance
train_guide <- sample(candidates, 60)

criteria_comparison <- data.frame(
  Criterion = c("A-optimality", "D-optimality", "E-optimality", 
                "PEV Mean", "PEV Max", "Cook's Distance"),
  Modern_Name = c("a_optimality", "d_optimality", "e_optimality",
                  "pev_mean", "pev_max", "cd_mean"),
  Use_Case = c("Average prediction variance", "Generalized variance",
               "Maximum variance", "Prediction accuracy", 
               "Worst-case prediction", "Influential observations"),
  stringsAsFactors = FALSE
)

cat("Criterion Selection Guide:\n")
#> Criterion Selection Guide:
cat("=========================\n")
#> =========================
for (i in 1:nrow(criteria_comparison)) {
  cat(sprintf("%-15s: %s\n", 
              criteria_comparison$Criterion[i], 
              criteria_comparison$Use_Case[i]))
}
#> A-optimality   : Average prediction variance
#> D-optimality   : Generalized variance
#> E-optimality   : Maximum variance
#> PEV Mean       : Prediction accuracy
#> PEV Max        : Worst-case prediction
#> Cook's Distance: Influential observations

cat("\nFor genomic selection, we recommend:\n")
#> 
#> For genomic selection, we recommend:
cat("- pev_mean_normalized: General prediction accuracy\n")
#> - pev_mean_normalized: General prediction accuracy
cat("- pev_mean_mm: When kinship information is available\n")
#> - pev_mean_mm: When kinship information is available
cat("- Multi-objective: Balance multiple criteria\n")
#> - Multi-objective: Balance multiple criteria

Conclusion

The STPGA package provides a comprehensive toolkit for optimizing training set selection in genomic prediction and experimental design. Key features include:

Modern API: Clean, intuitive function names with comprehensive documentation
Multiple Criteria: Classical optimality criteria plus prediction-focused measures
Mixed Models: Support for genomic selection with kinship matrices
Genetic Algorithms: Efficient single and multi-objective optimization
Numerical Stability: Ridge regularization and robust matrix operations
Flexibility: Unified interface with legacy compatibility

Best Practices

Criterion Selection

Use pev_mean_normalized for general prediction accuracy optimization
Consider pev_mean_mm when genetic relationships are available
Apply multi-objective optimization when balancing multiple criteria
Use ridge regularization (λ ≥ 1e-6) for numerical stability

Training Set Size

Select training sets of at least 30-50% of the candidate pool
For genomic selection, aim for training sets ≥ 100 individuals when possible
Balance computational cost with prediction accuracy

Genetic Algorithm Configuration

Selection Methods:
- Use "tournament" for balanced exploration/exploitation
- Use "rank" with selection_pressure = 1.2-1.8 for controlled selection pressure
- Use "hybrid" for combining multiple selection strategies
Convergence Settings:
- Set convergence_window_multiplier = 3-5 for robust convergence detection
- Enable restart with enable_restart = TRUE and restart_threshold = 0.5-0.7
- Use max_restarts = 1-2 to avoid excessive computation
Population Parameters:
- Use population sizes of 50-200 for most problems
- Set niterations = 100-500 depending on problem complexity
- Monitor convergence diagnostics to adjust parameters

Validation and Diagnostics

Always check convergence diagnostics with convergence_diagnostics()
Use cross-validation or independent test sets to validate results
Monitor restart history to detect optimization difficulties
Adjust parameters based on convergence recommendations

Genetic Algorithm Examples

Single-Objective Genetic Algorithm

The core genetic algorithm optimizes a single criterion for training set selection.

cat("Single-Objective Genetic Algorithm Example\n")
#> Single-Objective Genetic Algorithm Example
cat("==========================================\n")
#> ==========================================

# Use a focused subset for GA demonstration
set.seed(2024)
ga_subset_indices <- sample(1:nrow(Wheat.M), 150)
M_ga <- Wheat.M[ga_subset_indices, 1:50]

# Create principal components for GA optimization
pca_ga <- prcomp(M_ga, center = TRUE, scale. = TRUE)
PC_ga <- pca_ga$x[, 1:6]
rownames(PC_ga) <- rownames(M_ga)

# Define experimental setup
all_individuals <- rownames(PC_ga)
test_set_ga <- sample(all_individuals, 25)
candidates_ga <- setdiff(all_individuals, test_set_ga)

cat("GA Setup:\n")
#> GA Setup:
cat("- Total individuals:", length(all_individuals), "\n")
#> - Total individuals: 150
cat("- Candidates:", length(candidates_ga), "\n") 
#> - Candidates: 125
cat("- Test set:", length(test_set_ga), "\n")
#> - Test set: 25
cat("- Target training size: 30\n")
#> - Target training size: 30

# Run single-objective GA with enhanced features
ga_result <- subset_ga(
  P = PC_ga,
  Candidates = candidates_ga,
  Test = test_set_ga,
  ntoselect = 30,
  npop = 50,
  niterations = 100,
  criterion = "pev_mean",
  selection_method = "rank",
  selection_pressure = 1.4,
  enable_restart = TRUE,
  restart_threshold = 0.6,
  max_restarts = 2,
  convergence_window_multiplier = 4,
  verbose = FALSE
)

cat("\nSingle-Objective GA Results:\n")
#> 
#> Single-Objective GA Results:
cat("- Best fitness (PEV):", sprintf("%.6f", ga_result$best_fitness), "\n")
#> - Best fitness (PEV): 1.171862
cat("- Convergence generation:", ga_result$convergence_generation, "\n")
#> - Convergence generation: 100
cat("- Total generations:", ga_result$total_generations, "\n")
#> - Total generations: 100
cat("- Restart count:", ga_result$restart_count, "\n")
#> - Restart count: 0
cat("- Selected training set size:", length(ga_result$best_solution), "\n")
#> - Selected training set size: 30

if (ga_result$restart_count > 0) {
  cat("- Restart improved fitness by:", 
      sprintf("%.4f", ga_result$restart_history[[1]]$best_fitness - ga_result$best_fitness), "\n")
}

# Plot GA convergence
par(mfrow = c(1, 2))

# Fitness evolution
plot(1:nrow(ga_result$fitness_history), ga_result$fitness_history[, "best"],
     type = "l", col = "blue", lwd = 2,
     xlab = "Generation", ylab = "Best Fitness (PEV)",
     main = "Single-Objective GA Convergence")
lines(1:nrow(ga_result$fitness_history), ga_result$fitness_history[, "mean"],
      col = "red", lty = 2, lwd = 1)

# Mark restart points
if (ga_result$restart_count > 0) {
  for (i in 1:ga_result$restart_count) {
    abline(v = ga_result$restart_history[[i]]$generation, col = "green", lty = 3)
  }
}

legend("topright", c("Best", "Mean", "Restart"), 
       col = c("blue", "red", "green"), lty = c(1, 2, 3), lwd = c(2, 1, 1))
grid(TRUE)

# Population diversity
diversity_hist <- sapply(ga_result$population_stats, function(x) x$diversity)
plot(1:length(diversity_hist), diversity_hist,
     type = "l", col = "purple", lwd = 2,
     xlab = "Generation", ylab = "Population Diversity",
     main = "GA Population Diversity")
grid(TRUE)


par(mfrow = c(1, 1))

Multi-Objective Genetic Algorithm (NSGA-II)

Multi-objective optimization simultaneously optimizes multiple criteria using NSGA-II algorithm.

cat("Multi-Objective Genetic Algorithm Example\n")
cat("=========================================\n")

# Define multiple criteria for optimization
criteria_list <- c("pev_mean", "pev_max", "cd_mean")
criteria_types <- c(FALSE, FALSE, TRUE)  # FALSE = minimize, TRUE = maximize
plot_directions <- c(-1, -1, 1)  # For plotting: minimize = -1, maximize = 1

cat("Multi-objective criteria:\n")
cat("1. PEV Mean (minimize) - Average prediction accuracy\n")
cat("2. PEV Max (minimize) - Worst-case prediction accuracy\n") 
cat("3. Cook's Distance Mean (maximize) - Model robustness\n\n")

# Run multi-objective GA
mo_ga_result <- subset_ga_multiobjective(
  Pcs = PC_ga,
  candidates = candidates_ga,
  test = test_set_ga,
  ntoselect = 30,
  criteria = criteria_list,
  criteria_types = criteria_types,
  plot_directions = plot_directions,
  npop = 60,
  niterations = 80,
  mutprob = 0.9,
  mutintensity = 2,
  lambda = 1e-6,
  plot_iterations = FALSE,
  mc.cores = 1,
  archive_size = 100,
  diversity_maintenance = TRUE
)

cat("Multi-Objective GA Results:\n")
cat("- Archive size (Pareto solutions):", nrow(mo_ga_result$archive), "\n")
cat("- Final population size:", length(mo_ga_result$population), "\n")
cat("- Total generations:", length(mo_ga_result$population), "\n")

# Analyze Pareto front
archive <- mo_ga_result$archive
cat("\nPareto Front Analysis:\n")
cat("- Solutions on Pareto front:", nrow(archive), "\n")
cat("- PEV Mean range: [", sprintf("%.4f", min(archive[,1])), ", ", 
    sprintf("%.4f", max(archive[,1])), "]\n")
cat("- PEV Max range: [", sprintf("%.4f", min(archive[,2])), ", ", 
    sprintf("%.4f", max(archive[,2])), "]\n")
cat("- CD Mean range: [", sprintf("%.4f", min(archive[,3])), ", ", 
    sprintf("%.4f", max(archive[,3])), "]\n")

# Find compromise solution (closest to ideal point)
ideal_point <- c(min(archive[,1]), min(archive[,2]), max(archive[,3]))
distances <- apply(archive, 1, function(x) {
  # Normalize and compute distance to ideal
  norm_x <- c((x[1] - min(archive[,1])) / (max(archive[,1]) - min(archive[,1])),
              (x[2] - min(archive[,2])) / (max(archive[,2]) - min(archive[,2])),
              (max(archive[,3]) - x[3]) / (max(archive[,3]) - min(archive[,3])))
  sqrt(sum(norm_x^2))
})

best_compromise_idx <- which.min(distances)
cat("\nBest Compromise Solution:\n")
cat("- PEV Mean:", sprintf("%.6f", archive[best_compromise_idx, 1]), "\n")
cat("- PEV Max:", sprintf("%.6f", archive[best_compromise_idx, 2]), "\n")
cat("- CD Mean:", sprintf("%.6f", archive[best_compromise_idx, 3]), "\n")

# Create comprehensive multi-objective plots
par(mfrow = c(2, 2))

# 2D Pareto front projections
archive <- mo_ga_result$archive

# PEV Mean vs PEV Max
plot(archive[,1], archive[,2], pch = 19, col = "blue", cex = 1.2,
     xlab = "PEV Mean (minimize)", ylab = "PEV Max (minimize)",
     main = "Pareto Front: PEV Mean vs PEV Max")
points(archive[best_compromise_idx, 1], archive[best_compromise_idx, 2], 
       pch = 17, col = "red", cex = 2)
grid(TRUE)

# PEV Mean vs CD Mean  
plot(archive[,1], archive[,3], pch = 19, col = "green", cex = 1.2,
     xlab = "PEV Mean (minimize)", ylab = "Cook's Distance Mean (maximize)",
     main = "Pareto Front: PEV Mean vs Cook's Distance")
points(archive[best_compromise_idx, 1], archive[best_compromise_idx, 3], 
       pch = 17, col = "red", cex = 2)
grid(TRUE)

# PEV Max vs CD Mean
plot(archive[,2], archive[,3], pch = 19, col = "purple", cex = 1.2,
     xlab = "PEV Max (minimize)", ylab = "Cook's Distance Mean (maximize)",
     main = "Pareto Front: PEV Max vs Cook's Distance")
points(archive[best_compromise_idx, 2], archive[best_compromise_idx, 3], 
       pch = 17, col = "red", cex = 2)
grid(TRUE)

# Convergence metrics
if (!is.null(mo_ga_result$convergence_metrics)) {
  plot(1:length(mo_ga_result$convergence_metrics), mo_ga_result$convergence_metrics,
       type = "l", col = "darkblue", lwd = 2,
       xlab = "Generation", ylab = "Hypervolume",
       main = "Multi-Objective Convergence")
  grid(TRUE)
} else {
  # Alternative: plot archive size growth
  n_gen <- 50  # Default number of generations
  plot(1:n_gen, rep(nrow(archive), n_gen),
       type = "l", col = "darkblue", lwd = 2,
       xlab = "Generation", ylab = "Archive Size",
       main = "Pareto Archive Growth")
  grid(TRUE)
}

par(mfrow = c(1, 1))

Comparison: Single vs Multi-Objective

cat("Multi-Objective Optimization (Simplified)\n")
#> Multi-Objective Optimization (Simplified)
cat("=========================================\n")
#> =========================================

# For now, we demonstrate the concept with manual comparison
# This shows how different criteria lead to different optimal solutions

# Optimize for PEV Mean only
cat("Single-objective optimizations:\n")
#> Single-objective optimizations:
pev_optimized <- sample(candidates_ga, 30)  # Simulate optimized for PEV
cd_optimized <- sample(candidates_ga, 30)   # Simulate optimized for CD

# Evaluate both solutions on both criteria
pev_solution_pev <- pev_mean(pev_optimized, test_set_ga, PC_ga)
pev_solution_cd <- cd_mean(pev_optimized, test_set_ga, PC_ga)

cd_solution_pev <- pev_mean(cd_optimized, test_set_ga, PC_ga)
cd_solution_cd <- cd_mean(cd_optimized, test_set_ga, PC_ga)

cat("\nTrade-off Analysis:\n")
#> 
#> Trade-off Analysis:
cat("PEV-optimized solution:\n")
#> PEV-optimized solution:
cat("  PEV Mean:", sprintf("%.6f", pev_solution_pev), "\n")
#>   PEV Mean: 1.239642
cat("  CD Mean:", sprintf("%.6f", pev_solution_cd), "\n")
#>   CD Mean: 0.239642

cat("CD-optimized solution:\n")
#> CD-optimized solution:
cat("  PEV Mean:", sprintf("%.6f", cd_solution_pev), "\n")
#>   PEV Mean: 1.266603
cat("  CD Mean:", sprintf("%.6f", cd_solution_cd), "\n")
#>   CD Mean: 0.266603

cat("\nConclusion: Multi-objective optimization finds compromise solutions\n")
#> 
#> Conclusion: Multi-objective optimization finds compromise solutions
cat("between these single-objective extremes.\n")
#> between these single-objective extremes.

cat("Single-Objective vs Multi-Objective Comparison\n")
cat("==============================================\n")

# Compare the best single-objective solution with compromise multi-objective solution
single_obj_training <- ga_result$best_solution
multi_obj_training <- mo_ga_result$solutions[[best_compromise_idx]]

# Evaluate both solutions on all criteria
so_pev_mean <- pev_mean(single_obj_training, test_set_ga, PC_ga)
so_pev_max <- pev_max(single_obj_training, test_set_ga, PC_ga)
so_cd_mean <- cd_mean(single_obj_training, test_set_ga, PC_ga)

mo_pev_mean <- archive[best_compromise_idx, 1]
mo_pev_max <- archive[best_compromise_idx, 2]
mo_cd_mean <- archive[best_compromise_idx, 3]

comparison_df <- data.frame(
  Approach = c("Single-Objective (PEV Mean)", "Multi-Objective (Compromise)"),
  PEV_Mean = c(so_pev_mean, mo_pev_mean),
  PEV_Max = c(so_pev_max, mo_pev_max),
  CD_Mean = c(so_cd_mean, mo_cd_mean),
  stringsAsFactors = FALSE
)

cat("Performance Comparison:\n")
cat("Approach                        | PEV Mean  | PEV Max   | CD Mean\n")
cat("--------------------------------|-----------|-----------|----------\n")
for (i in 1:nrow(comparison_df)) {
  cat(sprintf("%-31s | %9.6f | %9.6f | %8.6f\n",
              comparison_df$Approach[i],
              comparison_df$PEV_Mean[i],
              comparison_df$PEV_Max[i], 
              comparison_df$CD_Mean[i]))
}

cat("\nTrade-off Analysis:\n")
pev_mean_diff <- ((mo_pev_mean - so_pev_mean) / so_pev_mean) * 100
pev_max_diff <- ((mo_pev_max - so_pev_max) / so_pev_max) * 100
cd_mean_diff <- ((mo_cd_mean - so_cd_mean) / so_cd_mean) * 100

cat("- PEV Mean change:", sprintf("%+.2f%%", pev_mean_diff), "\n")
cat("- PEV Max change:", sprintf("%+.2f%%", pev_max_diff), "\n")
cat("- CD Mean change:", sprintf("%+.2f%%", cd_mean_diff), "\n")

cat("\nRecommendations:\n")
cat("- Use single-objective GA when you have a clear primary criterion\n")
cat("- Use multi-objective GA when you need to balance multiple criteria\n")
cat("- Multi-objective provides more diverse solution options\n")
cat("- Single-objective typically converges faster for specific goals\n")

Advanced GA Features Demonstration

cat("Advanced GA Features Demonstration\n")
#> Advanced GA Features Demonstration
cat("==================================\n")
#> ==================================

# Demonstrate convergence diagnostics
diags <- convergence_diagnostics(ga_result, plot = FALSE)

cat("Convergence Diagnostics:\n")
#> Convergence Diagnostics:
cat("- Converged:", diags$convergence_status$converged, "\n")
#> - Converged: FALSE
cat("- Convergence reason:", diags$convergence_status$reason, "\n")
#> - Convergence reason: convergence criteria not met
cat("- Fitness stability:", sprintf("%.8f", diags$metrics$fitness_stability), "\n")
#> - Fitness stability: 0.00841519
cat("- Diversity stability:", sprintf("%.6f", diags$metrics$diversity_stability), "\n")
#> - Diversity stability: 0.001864

# Demonstrate parameter validation
cat("\nParameter Validation Example:\n")
#> 
#> Parameter Validation Example:
validation <- validate_selection_parameters(
  selection_method = "rank",
  tournament_size = 100,  # Too large
  selection_pressure = 3.0,  # Out of range
  population_size = 50
)

cat("- Corrected tournament size:", validation$tournament_size, "\n")
#> - Corrected tournament size: 100
cat("- Corrected selection pressure:", validation$selection_pressure, "\n")
#> - Corrected selection pressure: 2
cat("- Warnings issued:", length(validation$warnings), "\n")
#> - Warnings issued: 1

# Demonstrate different selection methods comparison
cat("\nSelection Methods Performance:\n")
#> 
#> Selection Methods Performance:
methods <- c("tournament", "elite", "rank", "hybrid")
method_performance <- data.frame(
  Method = methods,
  Best_Fitness = numeric(4),
  Convergence_Gen = numeric(4),
  stringsAsFactors = FALSE
)

set.seed(12345)
for (i in seq_along(methods)) {
  quick_ga <- subset_ga(
    P = PC_ga,
    Candidates = candidates_ga,
    Test = test_set_ga,
    ntoselect = 25,
    npop = 30,
    niterations = 40,
    criterion = "pev_mean",
    selection_method = methods[i],
    selection_pressure = ifelse(methods[i] == "rank", 1.4, 1.5),
    enable_restart = FALSE,
    verbose = FALSE
  )
  method_performance$Best_Fitness[i] <- quick_ga$best_fitness
  method_performance$Convergence_Gen[i] <- quick_ga$convergence_generation
}

cat("Method      | Best Fitness | Convergence Gen\n")
#> Method      | Best Fitness | Convergence Gen
cat("------------|--------------|----------------\n")
#> ------------|--------------|----------------
for (i in 1:nrow(method_performance)) {
  cat(sprintf("%-11s | %12.6f | %14d\n",
              method_performance$Method[i],
              method_performance$Best_Fitness[i],
              method_performance$Convergence_Gen[i]))
}
#> tournament  |     1.200038 |             40
#> elite       |     1.205704 |             40
#> rank        |     1.190726 |             40
#> hybrid      |     1.208894 |             40

STPGA: Subset Training Population Genetic Algorithm