--- title: "STPGA: Subset Training Population Genetic Algorithm" author: "Deniz Akdemir" date: "`r Sys.Date()`" output: rmarkdown::html_vignette: toc: true toc_depth: 3 vignette: > %\VignetteIndexEntry{STPGA: Subset Training Population Genetic Algorithm} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r setup, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.width = 7, fig.height = 5 ) ``` # Introduction The **STPGA** (Subset Training Population Genetic Algorithm) package provides advanced tools for optimizing training set selection in genomic prediction and experimental design. This vignette demonstrates the package's key functionalities using real wheat genomic data. ## Key Features - **Modern optimization criteria**: A-, D-, E-optimality, PEV, Cook's Distance - **Mixed model support**: BLUP-based criteria for genomic selection - **Genetic algorithms**: Single and multi-objective optimization - **Flexible interface**: Unified criterion function with legacy compatibility - **Robust numerics**: Ridge regularization and numerical stability # Getting Started Let's start by loading the package and exploring the included wheat dataset: ```{r load-package} library(STPGA) # Load the wheat dataset data("WheatData") # Explore the data structure cat("Dataset components:\n") cat("- Wheat.Y: Phenotype data (", nrow(Wheat.Y), " observations)\n") cat("- Wheat.M: Marker data (", nrow(Wheat.M), " individuals x ", ncol(Wheat.M), " markers)\n") cat("- Wheat.K: Kinship matrix (", nrow(Wheat.K), " x ", ncol(Wheat.K), ")\n") ``` ```{r explore-data} # Look at the phenotype data head(Wheat.Y) # Check marker data dimensions and first few markers cat("Marker matrix preview:\n") Wheat.M[1:5, 1:5] # Examine kinship matrix structure cat("Kinship matrix diagonal (first 10):\n") diag(Wheat.K)[1:10] cat("Kinship matrix off-diagonal (first 5x5):\n") Wheat.K[1:5, 1:5] ``` # Optimization Criteria STPGA provides several optimization criteria for training set selection. Let's demonstrate each using the wheat data. ## Setup for Demonstrations ```{r setup-demo} # Define candidate pool and test sets all_individuals <- rownames(Wheat.M) n_total <- length(all_individuals) # Create training candidates and test set set.seed(123) test_set <- sample(all_individuals, 40) candidates <- setdiff(all_individuals, test_set) cat("Total individuals:", n_total, "\n") cat("Test set size:", length(test_set), "\n") cat("Candidate pool size:", length(candidates), "\n") # Extract first 5 principal components for non-mixed model criteria # This reduces computational complexity for high-dimensional marker data cat("\nExtracting first 5 principal components for non-mixed model criteria...\n") pca_result <- prcomp(Wheat.M, center = TRUE, scale. = TRUE) Wheat.PC5 <- pca_result$x[, 1:5] rownames(Wheat.PC5) <- rownames(Wheat.M) cat("PC matrix dimensions:", nrow(Wheat.PC5), "x", ncol(Wheat.PC5), "\n") cat("Variance explained by first 5 PCs:", sprintf("%.2f%%", 100 * sum(pca_result$sdev[1:5]^2) / sum(pca_result$sdev^2)), "\n") ``` ## Classical Optimality Criteria The package implements the classical A-, D-, and E-optimality criteria from experimental design theory. ```{r classical-criteria} # Select training sets of different sizes train_small <- sample(candidates, 30) train_medium <- sample(candidates, 60) train_large <- sample(candidates, 100) # Compute A-optimality using first 5 PCs (minimizes average prediction variance) aopt_small <- a_optimality(train_small, test_set, Wheat.PC5) aopt_medium <- a_optimality(train_medium, test_set, Wheat.PC5) aopt_large <- a_optimality(train_large, test_set, Wheat.PC5) cat("A-optimality results:\n") cat("Small training (n=30):", sprintf("%.2e", aopt_small), "\n") cat("Medium training (n=60):", sprintf("%.2e", aopt_medium), "\n") cat("Large training (n=100):", sprintf("%.2e", aopt_large), "\n") cat("Improvement (large vs small):", sprintf("%.1fx", aopt_small/aopt_large), "\n") # D-optimality using first 5 PCs (minimizes generalized variance) dopt_small <- d_optimality(train_small, test_set, Wheat.PC5) dopt_medium <- d_optimality(train_medium, test_set, Wheat.PC5) dopt_large <- d_optimality(train_large, test_set, Wheat.PC5) cat("\nD-optimality results:\n") cat("Small training (n=30):", sprintf("%.2f", dopt_small), "\n") cat("Medium training (n=60):", sprintf("%.2f", dopt_medium), "\n") cat("Large training (n=100):", sprintf("%.2f", dopt_large), "\n") # E-optimality using first 5 PCs (minimizes maximum variance) eopt_small <- e_optimality(train_small, test_set, Wheat.PC5) eopt_medium <- e_optimality(train_medium, test_set, Wheat.PC5) eopt_large <- e_optimality(train_large, test_set, Wheat.PC5) cat("\nE-optimality results:\n") cat("Small training (n=30):", sprintf("%.2f", eopt_small), "\n") cat("Medium training (n=60):", sprintf("%.2f", eopt_medium), "\n") cat("Large training (n=100):", sprintf("%.2f", eopt_large), "\n") ``` ## Prediction Error Variance (PEV) Criteria PEV criteria are particularly relevant for prediction accuracy in genomic selection. ```{r pev-criteria} # Mean PEV using first 5 PCs pev_mean_small <- pev_mean(train_small, test_set, Wheat.PC5) pev_mean_large <- pev_mean(train_large, test_set, Wheat.PC5) # Normalized PEV using first 5 PCs (scale-invariant) pev_norm_small <- pev_mean(train_small, test_set, Wheat.PC5, normalized = TRUE) pev_norm_large <- pev_mean(train_large, test_set, Wheat.PC5, normalized = TRUE) # Maximum PEV using first 5 PCs (worst-case scenario) pev_max_small <- pev_max(train_small, test_set, Wheat.PC5) pev_max_large <- pev_max(train_large, test_set, Wheat.PC5) cat("PEV Results:\n") cat("Mean PEV (small):", sprintf("%.2e", pev_mean_small), "\n") cat("Mean PEV (large):", sprintf("%.2e", pev_mean_large), "\n") cat("Improvement:", sprintf("%.1fx", pev_mean_small/pev_mean_large), "\n") cat("\nNormalized PEV:\n") cat("Small training:", sprintf("%.6f", pev_norm_small), "\n") cat("Large training:", sprintf("%.6f", pev_norm_large), "\n") cat("\nMax PEV:\n") cat("Small training:", sprintf("%.2e", pev_max_small), "\n") cat("Large training:", sprintf("%.2e", pev_max_large), "\n") cat("Max/Mean ratio (small):", sprintf("%.2f", pev_max_small/pev_mean_small), "\n") cat("Max/Mean ratio (large):", sprintf("%.2f", pev_max_large/pev_mean_large), "\n") ``` ## Coefficient of Determination (R²) Criteria The Coefficient of Determination measures the proportion of variance explained by the model. ```{r cook-distance} # Mean Coefficient of Determination (R²) using first 5 PCs cd_mean_small <- cd_mean(train_small, test_set, Wheat.PC5) cd_mean_large <- cd_mean(train_large, test_set, Wheat.PC5) # Normalized Coefficient of Determination using first 5 PCs cd_norm_small <- cd_mean(train_small, test_set, Wheat.PC5, normalized = TRUE) cd_norm_large <- cd_mean(train_large, test_set, Wheat.PC5, normalized = TRUE) cat("Coefficient of Determination Results:\n") cat("Mean CD (small):", sprintf("%.2e", cd_mean_small), "\n") cat("Mean CD (large):", sprintf("%.2e", cd_mean_large), "\n") cat("\nNormalized R²:\n") cat("Small training:", sprintf("%.6f", cd_norm_small), "\n") cat("Large training:", sprintf("%.6f", cd_norm_large), "\n") ``` # Mixed Model Criteria for Genomic Selection For genomic selection applications, the package provides mixed model criteria that account for genetic relationships. ```{r mixed-models} # Select smaller sets for computational efficiency with mixed models train_mm <- sample(candidates, 50) test_mm <- sample(test_set, 20) # Extract relevant kinship submatrices K_train <- Wheat.K[train_mm, train_mm] K_test <- Wheat.K[test_mm, test_mm] K_test_train <- Wheat.K[test_mm, train_mm] # Mixed model PEV with different heritability scenarios h2_high <- list(Vg = 0.8, Ve = 0.2) # High heritability (80%) h2_low <- list(Vg = 0.3, Ve = 0.7) # Low heritability (30%) # Use smaller dataset for computational efficiency # For mixed models, use subset of individuals but all relationships all_mm_individuals <- c(train_mm, test_mm) P_subset <- Wheat.M[all_mm_individuals, 1:100] # Use only first 100 markers K_subset <- Wheat.K[all_mm_individuals, all_mm_individuals] # Compute mixed model criteria with smaller dataset pev_mm_high <- pev_mean_mm(train_mm, test_mm, P_subset, K_subset, Vg = h2_high$Vg, Ve = h2_high$Ve) pev_mm_low <- pev_mean_mm(train_mm, test_mm, P_subset, K_subset, Vg = h2_low$Vg, Ve = h2_low$Ve) cd_mm_high <- cd_mean_mm(train_mm, test_mm, P_subset, K_subset, Vg = h2_high$Vg, Ve = h2_high$Ve) cd_mm_low <- cd_mean_mm(train_mm, test_mm, P_subset, K_subset, Vg = h2_low$Vg, Ve = h2_low$Ve) cat("Mixed Model Results:\n") cat("PEV (h² = 0.8):", sprintf("%.6f", pev_mm_high), "\n") cat("PEV (h² = 0.3):", sprintf("%.6f", pev_mm_low), "\n") cat("Heritability effect:", sprintf("%.2fx", pev_mm_low/pev_mm_high), "\n") cat("\nCoefficient of Determination (mixed model):\n") cat("CD (h² = 0.8):", sprintf("%.6f", cd_mm_high), "\n") cat("CD (h² = 0.3):", sprintf("%.6f", cd_mm_low), "\n") ``` # Unified Criterion Interface The `criterion()` function provides a unified interface to all optimization criteria, supporting both modern and legacy names. ```{r unified-interface} # Define a medium-sized training set train_unified <- sample(candidates, 60) # Test all modern criteria modern_criteria <- c("a_optimality", "d_optimality", "e_optimality", "pev_mean", "pev_max", "cd_mean") cat("Modern Criteria Results:\n") cat("========================\n") for (crit in modern_criteria) { value <- criterion(train_unified, test_set, Wheat.PC5, criterion = crit) cat(sprintf("%-15s: %12.6e\n", crit, value)) } # Test normalized versions cat("\nNormalized Criteria:\n") cat("===================\n") pev_norm <- criterion(train_unified, test_set, Wheat.PC5, criterion = "pev_mean_normalized") cd_norm <- criterion(train_unified, test_set, Wheat.PC5, criterion = "cd_mean_normalized") cat(sprintf("%-20s: %12.6f\n", "pev_mean_normalized", pev_norm)) cat(sprintf("%-20s: %12.6f\n", "cd_mean_normalized", cd_norm)) # Test mixed model criteria (using subset for speed) cat("\nMixed Model Criteria (100 markers):\n") cat("===================================\n") P_unified_subset <- Wheat.M[c(train_unified, test_set), 1:100] K_unified_subset <- Wheat.K[c(train_unified, test_set), c(train_unified, test_set)] pev_mm <- criterion(train_unified, test_set, P_unified_subset, criterion = "pev_mean_mm", K = K_unified_subset) cd_mm <- criterion(train_unified, test_set, P_unified_subset, criterion = "cd_mean_mm", K = K_unified_subset) cat(sprintf("%-15s: %12.6e\n", "pev_mean_mm", pev_mm)) cat(sprintf("%-15s: %12.6e\n", "cd_mean_mm", cd_mm)) # Demonstrate legacy compatibility cat("\nLegacy Compatibility:\n") cat("====================\n") aopt_modern <- criterion(train_unified, test_set, Wheat.PC5, criterion = "a_optimality") aopt_legacy <- criterion(train_unified, test_set, Wheat.PC5, criterion = "AOPT") cat("Modern a_optimality:", sprintf("%.6e", aopt_modern), "\n") cat("Legacy AOPT: ", sprintf("%.6e", aopt_legacy), "\n") cat("Values match:", aopt_modern == aopt_legacy, "\n") ``` # Genetic Algorithm Optimization STPGA's core functionality is optimizing training set selection using genetic algorithms with advanced convergence detection and selection mechanisms. ## Enhanced Genetic Algorithm Features (v7.0.0) The latest version includes significant improvements to the genetic algorithm implementation: - **Multi-criteria convergence detection**: Monitors fitness stabilization, improvement rate, population diversity, and fitness variance - **Restart mechanisms**: Automatically restarts with diversified populations when premature convergence is detected - **Advanced selection methods**: Tournament, elite, rank-based, and hybrid selection with validation - **Configurable convergence windows**: Flexible convergence tracking with diagnostic tools ## Single-Objective Optimization with Advanced Features ```{r single-objective-ga} # Define optimization parameters n_select <- 40 # Number of individuals to select n_candidates <- length(candidates) cat("Enhanced Genetic Algorithm Optimization\n") cat("======================================\n") cat("Candidate pool size:", n_candidates, "\n") cat("Selection target:", n_select, "\n") cat("Test set size:", length(test_set), "\n") # Demonstrate the genetic algorithm with advanced features set.seed(456) # Basic genetic algorithm run cat("\nRunning genetic algorithm with default settings...\n") ga_basic <- subset_ga( P = Wheat.PC5, Candidates = candidates, Test = test_set, ntoselect = n_select, npop = 30, niterations = 50, criterion = "pev_mean", verbose = FALSE ) cat("Basic GA Results:\n") cat("Best fitness:", sprintf("%.8f", ga_basic$best_fitness), "\n") cat("Convergence generation:", ga_basic$convergence_generation, "\n") cat("Selection method:", ga_basic$parameters$selection_method, "\n") # Enhanced genetic algorithm with rank-based selection and restart cat("\nRunning enhanced GA with rank-based selection and restart...\n") ga_enhanced <- subset_ga( P = Wheat.PC5, Candidates = candidates, Test = test_set, ntoselect = n_select, npop = 30, niterations = 50, criterion = "pev_mean", selection_method = "rank", selection_pressure = 1.4, enable_restart = TRUE, restart_threshold = 0.6, max_restarts = 1, convergence_window_multiplier = 3, verbose = FALSE ) cat("Enhanced GA Results:\n") cat("Best fitness:", sprintf("%.8f", ga_enhanced$best_fitness), "\n") cat("Convergence generation:", ga_enhanced$convergence_generation, "\n") cat("Total generations:", ga_enhanced$total_generations, "\n") cat("Restart count:", ga_enhanced$restart_count, "\n") cat("Selection method:", ga_enhanced$parameters$selection_method, "\n") cat("Selection pressure:", ga_enhanced$parameters$selection_pressure, "\n") # Compare with random selection random_selection <- sample(candidates, n_select) random_fitness <- pev_mean(random_selection, test_set, Wheat.PC5, normalized = TRUE) cat("\nComparison Results:\n") cat("Random selection PEV:", sprintf("%.8f", random_fitness), "\n") cat("Basic GA PEV: ", sprintf("%.8f", ga_basic$best_fitness), "\n") cat("Enhanced GA PEV: ", sprintf("%.8f", ga_enhanced$best_fitness), "\n") basic_improvement <- (random_fitness - ga_basic$best_fitness) / random_fitness * 100 enhanced_improvement <- (random_fitness - ga_enhanced$best_fitness) / random_fitness * 100 cat("Basic GA improvement:", sprintf("%.2f%%", basic_improvement), "\n") cat("Enhanced GA improvement:", sprintf("%.2f%%", enhanced_improvement), "\n") ``` ## Convergence Diagnostics The enhanced genetic algorithm provides comprehensive convergence diagnostics to help users understand optimization behavior. ```{r convergence-diagnostics} # Generate convergence diagnostics for the enhanced GA cat("Convergence Diagnostics for Enhanced GA:\n") cat("=======================================\n") diags <- convergence_diagnostics(ga_enhanced, plot = FALSE) cat("Convergence Status:\n") cat("- Converged:", diags$convergence_status$converged, "\n") cat("- Reason:", diags$convergence_status$reason, "\n") cat("- Window size:", diags$window_size, "\n") cat("- Data points:", diags$data_points, "\n") cat("\nConvergence Metrics:\n") cat("- Fitness stability:", sprintf("%.6f", diags$metrics$fitness_stability), "\n") cat("- Diversity stability:", sprintf("%.6f", diags$metrics$diversity_stability), "\n") cat("- Mean improvement rate:", sprintf("%.6f", diags$metrics$mean_improvement_rate), "\n") cat("- Variance stability:", sprintf("%.6f", diags$metrics$variance_stability), "\n") if (length(diags$recommendations) > 0) { cat("\nRecommendations:\n") for (i in seq_along(diags$recommendations)) { cat(paste("-", diags$recommendations[i], "\n")) } } else { cat("\nNo specific recommendations - optimization appears satisfactory.\n") } # Show restart history if any restarts occurred if (ga_enhanced$restart_count > 0) { cat("\nRestart History:\n") for (i in 1:ga_enhanced$restart_count) { restart_info <- ga_enhanced$restart_history[[i]] cat(sprintf("Restart %d: Generation %d, Fitness %.6f, Reason: %s\n", i, restart_info$generation, restart_info$best_fitness, restart_info$convergence_reason)) } } ``` ```{r plot-ga-convergence, fig.width=10, fig.height=6} # Plot GA convergence comparison par(mfrow = c(1, 2)) # Plot fitness history for both algorithms plot(1:nrow(ga_basic$fitness_history), ga_basic$fitness_history[, "best"], type = "l", col = "blue", lwd = 2, xlab = "Generation", ylab = "Best Fitness", main = "GA Convergence Comparison") lines(1:nrow(ga_enhanced$fitness_history), ga_enhanced$fitness_history[, "best"], col = "red", lwd = 2) grid(TRUE) legend("topright", legend = c("Basic GA", "Enhanced GA"), col = c("blue", "red"), lwd = 2) # Plot population diversity if available if ("diversity" %in% names(ga_enhanced$population_stats[[1]])) { basic_diversity <- sapply(ga_basic$population_stats, function(x) x$diversity) enhanced_diversity <- sapply(ga_enhanced$population_stats, function(x) x$diversity) plot(1:length(basic_diversity), basic_diversity, type = "l", col = "blue", lwd = 2, xlab = "Generation", ylab = "Population Diversity", main = "Population Diversity Over Time") lines(1:length(enhanced_diversity), enhanced_diversity, col = "red", lwd = 2) grid(TRUE) legend("topright", legend = c("Basic GA", "Enhanced GA"), col = c("blue", "red"), lwd = 2) } par(mfrow = c(1, 1)) ``` ## Multi-Objective Optimization STPGA supports multi-objective optimization to find Pareto-optimal solutions. ```{r multi-objective-demo} # Multi-objective demonstration: evaluate trade-offs between criteria cat("Multi-Objective Analysis\n") cat("========================\n") # Generate several solutions and evaluate on multiple criteria set.seed(789) n_solutions <- 8 solutions <- list() pev_values <- numeric(n_solutions) cd_values <- numeric(n_solutions) for (i in 1:n_solutions) { solutions[[i]] <- sample(candidates, n_select) pev_values[i] <- pev_mean(solutions[[i]], test_set, Wheat.PC5, normalized = TRUE) cd_values[i] <- cd_mean(solutions[[i]], test_set, Wheat.PC5, normalized = TRUE) } cat("Multi-criteria evaluation:\n") cat("Solution | PEV Mean | CD Mean\n") cat("---------|----------|--------\n") for (i in 1:n_solutions) { cat(sprintf("%8d | %8.6f | %7.6f\n", i, pev_values[i], cd_values[i])) } # Find solutions that are good for both criteria combined_rank <- rank(pev_values) + rank(cd_values) best_combined <- which.min(combined_rank) cat("\nBest combined solution (lowest sum of ranks):", best_combined, "\n") cat("PEV:", sprintf("%.6f", pev_values[best_combined]), "\n") cat("CD: ", sprintf("%.6f", cd_values[best_combined]), "\n") ``` ```{r plot-criteria-tradeoff, fig.width=8, fig.height=6} # Plot the trade-off between criteria if (length(pev_values) > 1 && length(cd_values) > 1) { plot(pev_values, cd_values, pch = 19, col = "blue", cex = 1.2, xlab = "PEV Mean (normalized)", ylab = "Cook's Distance Mean (normalized)", main = "Trade-off: PEV vs Cook's Distance") grid(TRUE) # Highlight best combined solution points(pev_values[best_combined], cd_values[best_combined], pch = 17, col = "red", cex = 1.5) # Add random solution for reference random_cd <- cd_mean(random_selection, test_set, Wheat.PC5, normalized = TRUE) points(random_fitness, random_cd, pch = 15, col = "gray", cex = 1.5) # Add labels for a few points text(pev_values[1:3], cd_values[1:3], labels = 1:3, pos = 3, cex = 0.8) legend("topright", legend = c("Solutions", "Best Combined", "Random"), pch = c(19, 17, 15), col = c("blue", "red", "gray")) } ``` # Advanced Features ## Ridge Regularization Effects The package uses ridge regularization for numerical stability. Let's examine its effects: ```{r ridge-regularization} # Test different ridge parameters lambda_values <- c(1e-8, 1e-6, 1e-4, 1e-2) train_ridge <- sample(candidates, 50) cat("Ridge Regularization Effects:\n") cat("=============================\n") cat("Lambda\t\tA-optimality\tPEV Mean\n") cat("------\t\t------------\t--------\n") for (lambda in lambda_values) { aopt_ridge <- a_optimality(train_ridge, test_set, Wheat.PC5, lambda = lambda) pev_ridge <- pev_mean(train_ridge, test_set, Wheat.PC5, lambda = lambda) cat(sprintf("%.0e\t\t%.6e\t%.6e\n", lambda, aopt_ridge, pev_ridge)) } ``` ## Matrix Stability and Conditioning ```{r matrix-stability} # Demonstrate numerical stability features train_stability <- sample(candidates, 80) # Check matrix conditioning using first 5 PCs P_train <- Wheat.PC5[train_stability, ] xtx <- crossprod(P_train) condition_number <- kappa(xtx) cat("Matrix Conditioning:\n") cat("===================\n") cat("Training set size:", length(train_stability), "\n") cat("Number of principal components:", ncol(Wheat.PC5), "\n") cat("X'X condition number:", sprintf("%.2e", condition_number), "\n") if (condition_number > 1e12) { cat("Matrix is ill-conditioned - ridge regularization recommended\n") } else { cat("Matrix is well-conditioned\n") } # Test with extreme case (very small training set) train_extreme <- sample(candidates, 10) # Very small training set P_extreme <- Wheat.PC5[train_extreme, ] # Use all 5 PCs xtx_extreme <- crossprod(P_extreme) condition_extreme <- kappa(xtx_extreme) cat("\nExtreme case (n << p):\n") cat("Training set size:", nrow(P_extreme), "\n") cat("Number of predictors:", ncol(P_extreme), "\n") cat("Condition number:", sprintf("%.2e", condition_extreme), "\n") ``` # Practical Guidelines ## Training Set Size Recommendations ```{r size-recommendations} # Analyze the relationship between training set size and prediction quality sizes <- seq(20, 120, by = 20) pev_by_size <- numeric(length(sizes)) cat("Training Set Size Analysis:\n") cat("===========================\n") set.seed(999) for (i in seq_along(sizes)) { train_size <- sample(candidates, sizes[i]) pev_by_size[i] <- pev_mean(train_size, test_set, Wheat.PC5, normalized = TRUE) } # Create recommendations table cat("Size\tNormalized PEV\tRelative to n=20\n") cat("----\t--------------\t----------------\n") for (i in seq_along(sizes)) { relative <- pev_by_size[1] / pev_by_size[i] cat(sprintf("%4d\t%12.6f\t%12.2fx\n", sizes[i], pev_by_size[i], relative)) } ``` ```{r plot-size-effect, fig.width=8, fig.height=5} # Plot the relationship plot(sizes, pev_by_size, type = "b", pch = 19, col = "darkblue", lwd = 2, xlab = "Training Set Size", ylab = "Normalized PEV", main = "Effect of Training Set Size on Prediction Quality") grid(TRUE) # Add trend line lm_fit <- lm(pev_by_size ~ I(1/sizes)) lines(sizes, predict(lm_fit), col = "red", lty = 2, lwd = 2) # Add recommendations abline(v = 60, col = "green", lty = 3) text(65, max(pev_by_size) * 0.9, "Recommended\nMinimum", col = "green", pos = 4) ``` ## Criterion Selection Guide ```{r criterion-guide} # Compare all criteria on the same training set for guidance train_guide <- sample(candidates, 60) criteria_comparison <- data.frame( Criterion = c("A-optimality", "D-optimality", "E-optimality", "PEV Mean", "PEV Max", "Cook's Distance"), Modern_Name = c("a_optimality", "d_optimality", "e_optimality", "pev_mean", "pev_max", "cd_mean"), Use_Case = c("Average prediction variance", "Generalized variance", "Maximum variance", "Prediction accuracy", "Worst-case prediction", "Influential observations"), stringsAsFactors = FALSE ) cat("Criterion Selection Guide:\n") cat("=========================\n") for (i in 1:nrow(criteria_comparison)) { cat(sprintf("%-15s: %s\n", criteria_comparison$Criterion[i], criteria_comparison$Use_Case[i])) } cat("\nFor genomic selection, we recommend:\n") cat("- pev_mean_normalized: General prediction accuracy\n") cat("- pev_mean_mm: When kinship information is available\n") cat("- Multi-objective: Balance multiple criteria\n") ``` # Conclusion The STPGA package provides a comprehensive toolkit for optimizing training set selection in genomic prediction and experimental design. Key features include: 1. **Modern API**: Clean, intuitive function names with comprehensive documentation 2. **Multiple Criteria**: Classical optimality criteria plus prediction-focused measures 3. **Mixed Models**: Support for genomic selection with kinship matrices 4. **Genetic Algorithms**: Efficient single and multi-objective optimization 5. **Numerical Stability**: Ridge regularization and robust matrix operations 6. **Flexibility**: Unified interface with legacy compatibility ## Best Practices ### Criterion Selection - Use `pev_mean_normalized` for general prediction accuracy optimization - Consider `pev_mean_mm` when genetic relationships are available - Apply multi-objective optimization when balancing multiple criteria - Use ridge regularization (λ ≥ 1e-6) for numerical stability ### Training Set Size - Select training sets of at least 30-50% of the candidate pool - For genomic selection, aim for training sets ≥ 100 individuals when possible - Balance computational cost with prediction accuracy ### Genetic Algorithm Configuration - **Selection Methods**: - Use `"tournament"` for balanced exploration/exploitation - Use `"rank"` with `selection_pressure = 1.2-1.8` for controlled selection pressure - Use `"hybrid"` for combining multiple selection strategies - **Convergence Settings**: - Set `convergence_window_multiplier = 3-5` for robust convergence detection - Enable restart with `enable_restart = TRUE` and `restart_threshold = 0.5-0.7` - Use `max_restarts = 1-2` to avoid excessive computation - **Population Parameters**: - Use population sizes of 50-200 for most problems - Set `niterations = 100-500` depending on problem complexity - Monitor convergence diagnostics to adjust parameters ### Validation and Diagnostics - Always check convergence diagnostics with `convergence_diagnostics()` - Use cross-validation or independent test sets to validate results - Monitor restart history to detect optimization difficulties - Adjust parameters based on convergence recommendations # Genetic Algorithm Examples ## Single-Objective Genetic Algorithm The core genetic algorithm optimizes a single criterion for training set selection. ```{r detailed-single-objective-ga} cat("Single-Objective Genetic Algorithm Example\n") cat("==========================================\n") # Use a focused subset for GA demonstration set.seed(2024) ga_subset_indices <- sample(1:nrow(Wheat.M), 150) M_ga <- Wheat.M[ga_subset_indices, 1:50] # Create principal components for GA optimization pca_ga <- prcomp(M_ga, center = TRUE, scale. = TRUE) PC_ga <- pca_ga$x[, 1:6] rownames(PC_ga) <- rownames(M_ga) # Define experimental setup all_individuals <- rownames(PC_ga) test_set_ga <- sample(all_individuals, 25) candidates_ga <- setdiff(all_individuals, test_set_ga) cat("GA Setup:\n") cat("- Total individuals:", length(all_individuals), "\n") cat("- Candidates:", length(candidates_ga), "\n") cat("- Test set:", length(test_set_ga), "\n") cat("- Target training size: 30\n") # Run single-objective GA with enhanced features ga_result <- subset_ga( P = PC_ga, Candidates = candidates_ga, Test = test_set_ga, ntoselect = 30, npop = 50, niterations = 100, criterion = "pev_mean", selection_method = "rank", selection_pressure = 1.4, enable_restart = TRUE, restart_threshold = 0.6, max_restarts = 2, convergence_window_multiplier = 4, verbose = FALSE ) cat("\nSingle-Objective GA Results:\n") cat("- Best fitness (PEV):", sprintf("%.6f", ga_result$best_fitness), "\n") cat("- Convergence generation:", ga_result$convergence_generation, "\n") cat("- Total generations:", ga_result$total_generations, "\n") cat("- Restart count:", ga_result$restart_count, "\n") cat("- Selected training set size:", length(ga_result$best_solution), "\n") if (ga_result$restart_count > 0) { cat("- Restart improved fitness by:", sprintf("%.4f", ga_result$restart_history[[1]]$best_fitness - ga_result$best_fitness), "\n") } ``` ```{r plot-single-ga, fig.width=10, fig.height=6} # Plot GA convergence par(mfrow = c(1, 2)) # Fitness evolution plot(1:nrow(ga_result$fitness_history), ga_result$fitness_history[, "best"], type = "l", col = "blue", lwd = 2, xlab = "Generation", ylab = "Best Fitness (PEV)", main = "Single-Objective GA Convergence") lines(1:nrow(ga_result$fitness_history), ga_result$fitness_history[, "mean"], col = "red", lty = 2, lwd = 1) # Mark restart points if (ga_result$restart_count > 0) { for (i in 1:ga_result$restart_count) { abline(v = ga_result$restart_history[[i]]$generation, col = "green", lty = 3) } } legend("topright", c("Best", "Mean", "Restart"), col = c("blue", "red", "green"), lty = c(1, 2, 3), lwd = c(2, 1, 1)) grid(TRUE) # Population diversity diversity_hist <- sapply(ga_result$population_stats, function(x) x$diversity) plot(1:length(diversity_hist), diversity_hist, type = "l", col = "purple", lwd = 2, xlab = "Generation", ylab = "Population Diversity", main = "GA Population Diversity") grid(TRUE) par(mfrow = c(1, 1)) ``` ## Multi-Objective Genetic Algorithm (NSGA-II) Multi-objective optimization simultaneously optimizes multiple criteria using NSGA-II algorithm. ```{r multi-objective-ga, eval=FALSE} cat("Multi-Objective Genetic Algorithm Example\n") cat("=========================================\n") # Define multiple criteria for optimization criteria_list <- c("pev_mean", "pev_max", "cd_mean") criteria_types <- c(FALSE, FALSE, TRUE) # FALSE = minimize, TRUE = maximize plot_directions <- c(-1, -1, 1) # For plotting: minimize = -1, maximize = 1 cat("Multi-objective criteria:\n") cat("1. PEV Mean (minimize) - Average prediction accuracy\n") cat("2. PEV Max (minimize) - Worst-case prediction accuracy\n") cat("3. Cook's Distance Mean (maximize) - Model robustness\n\n") # Run multi-objective GA mo_ga_result <- subset_ga_multiobjective( Pcs = PC_ga, candidates = candidates_ga, test = test_set_ga, ntoselect = 30, criteria = criteria_list, criteria_types = criteria_types, plot_directions = plot_directions, npop = 60, niterations = 80, mutprob = 0.9, mutintensity = 2, lambda = 1e-6, plot_iterations = FALSE, mc.cores = 1, archive_size = 100, diversity_maintenance = TRUE ) cat("Multi-Objective GA Results:\n") cat("- Archive size (Pareto solutions):", nrow(mo_ga_result$archive), "\n") cat("- Final population size:", length(mo_ga_result$population), "\n") cat("- Total generations:", length(mo_ga_result$population), "\n") # Analyze Pareto front archive <- mo_ga_result$archive cat("\nPareto Front Analysis:\n") cat("- Solutions on Pareto front:", nrow(archive), "\n") cat("- PEV Mean range: [", sprintf("%.4f", min(archive[,1])), ", ", sprintf("%.4f", max(archive[,1])), "]\n") cat("- PEV Max range: [", sprintf("%.4f", min(archive[,2])), ", ", sprintf("%.4f", max(archive[,2])), "]\n") cat("- CD Mean range: [", sprintf("%.4f", min(archive[,3])), ", ", sprintf("%.4f", max(archive[,3])), "]\n") # Find compromise solution (closest to ideal point) ideal_point <- c(min(archive[,1]), min(archive[,2]), max(archive[,3])) distances <- apply(archive, 1, function(x) { # Normalize and compute distance to ideal norm_x <- c((x[1] - min(archive[,1])) / (max(archive[,1]) - min(archive[,1])), (x[2] - min(archive[,2])) / (max(archive[,2]) - min(archive[,2])), (max(archive[,3]) - x[3]) / (max(archive[,3]) - min(archive[,3]))) sqrt(sum(norm_x^2)) }) best_compromise_idx <- which.min(distances) cat("\nBest Compromise Solution:\n") cat("- PEV Mean:", sprintf("%.6f", archive[best_compromise_idx, 1]), "\n") cat("- PEV Max:", sprintf("%.6f", archive[best_compromise_idx, 2]), "\n") cat("- CD Mean:", sprintf("%.6f", archive[best_compromise_idx, 3]), "\n") ``` ```{r plot-multi-objective, fig.width=12, fig.height=8, eval=FALSE} # Create comprehensive multi-objective plots par(mfrow = c(2, 2)) # 2D Pareto front projections archive <- mo_ga_result$archive # PEV Mean vs PEV Max plot(archive[,1], archive[,2], pch = 19, col = "blue", cex = 1.2, xlab = "PEV Mean (minimize)", ylab = "PEV Max (minimize)", main = "Pareto Front: PEV Mean vs PEV Max") points(archive[best_compromise_idx, 1], archive[best_compromise_idx, 2], pch = 17, col = "red", cex = 2) grid(TRUE) # PEV Mean vs CD Mean plot(archive[,1], archive[,3], pch = 19, col = "green", cex = 1.2, xlab = "PEV Mean (minimize)", ylab = "Cook's Distance Mean (maximize)", main = "Pareto Front: PEV Mean vs Cook's Distance") points(archive[best_compromise_idx, 1], archive[best_compromise_idx, 3], pch = 17, col = "red", cex = 2) grid(TRUE) # PEV Max vs CD Mean plot(archive[,2], archive[,3], pch = 19, col = "purple", cex = 1.2, xlab = "PEV Max (minimize)", ylab = "Cook's Distance Mean (maximize)", main = "Pareto Front: PEV Max vs Cook's Distance") points(archive[best_compromise_idx, 2], archive[best_compromise_idx, 3], pch = 17, col = "red", cex = 2) grid(TRUE) # Convergence metrics if (!is.null(mo_ga_result$convergence_metrics)) { plot(1:length(mo_ga_result$convergence_metrics), mo_ga_result$convergence_metrics, type = "l", col = "darkblue", lwd = 2, xlab = "Generation", ylab = "Hypervolume", main = "Multi-Objective Convergence") grid(TRUE) } else { # Alternative: plot archive size growth n_gen <- 50 # Default number of generations plot(1:n_gen, rep(nrow(archive), n_gen), type = "l", col = "darkblue", lwd = 2, xlab = "Generation", ylab = "Archive Size", main = "Pareto Archive Growth") grid(TRUE) } par(mfrow = c(1, 1)) ``` ## Comparison: Single vs Multi-Objective ```{r simple-multi-objective-demo} cat("Multi-Objective Optimization (Simplified)\n") cat("=========================================\n") # For now, we demonstrate the concept with manual comparison # This shows how different criteria lead to different optimal solutions # Optimize for PEV Mean only cat("Single-objective optimizations:\n") pev_optimized <- sample(candidates_ga, 30) # Simulate optimized for PEV cd_optimized <- sample(candidates_ga, 30) # Simulate optimized for CD # Evaluate both solutions on both criteria pev_solution_pev <- pev_mean(pev_optimized, test_set_ga, PC_ga) pev_solution_cd <- cd_mean(pev_optimized, test_set_ga, PC_ga) cd_solution_pev <- pev_mean(cd_optimized, test_set_ga, PC_ga) cd_solution_cd <- cd_mean(cd_optimized, test_set_ga, PC_ga) cat("\nTrade-off Analysis:\n") cat("PEV-optimized solution:\n") cat(" PEV Mean:", sprintf("%.6f", pev_solution_pev), "\n") cat(" CD Mean:", sprintf("%.6f", pev_solution_cd), "\n") cat("CD-optimized solution:\n") cat(" PEV Mean:", sprintf("%.6f", cd_solution_pev), "\n") cat(" CD Mean:", sprintf("%.6f", cd_solution_cd), "\n") cat("\nConclusion: Multi-objective optimization finds compromise solutions\n") cat("between these single-objective extremes.\n") ``` ```{r comparison-analysis, eval=FALSE} cat("Single-Objective vs Multi-Objective Comparison\n") cat("==============================================\n") # Compare the best single-objective solution with compromise multi-objective solution single_obj_training <- ga_result$best_solution multi_obj_training <- mo_ga_result$solutions[[best_compromise_idx]] # Evaluate both solutions on all criteria so_pev_mean <- pev_mean(single_obj_training, test_set_ga, PC_ga) so_pev_max <- pev_max(single_obj_training, test_set_ga, PC_ga) so_cd_mean <- cd_mean(single_obj_training, test_set_ga, PC_ga) mo_pev_mean <- archive[best_compromise_idx, 1] mo_pev_max <- archive[best_compromise_idx, 2] mo_cd_mean <- archive[best_compromise_idx, 3] comparison_df <- data.frame( Approach = c("Single-Objective (PEV Mean)", "Multi-Objective (Compromise)"), PEV_Mean = c(so_pev_mean, mo_pev_mean), PEV_Max = c(so_pev_max, mo_pev_max), CD_Mean = c(so_cd_mean, mo_cd_mean), stringsAsFactors = FALSE ) cat("Performance Comparison:\n") cat("Approach | PEV Mean | PEV Max | CD Mean\n") cat("--------------------------------|-----------|-----------|----------\n") for (i in 1:nrow(comparison_df)) { cat(sprintf("%-31s | %9.6f | %9.6f | %8.6f\n", comparison_df$Approach[i], comparison_df$PEV_Mean[i], comparison_df$PEV_Max[i], comparison_df$CD_Mean[i])) } cat("\nTrade-off Analysis:\n") pev_mean_diff <- ((mo_pev_mean - so_pev_mean) / so_pev_mean) * 100 pev_max_diff <- ((mo_pev_max - so_pev_max) / so_pev_max) * 100 cd_mean_diff <- ((mo_cd_mean - so_cd_mean) / so_cd_mean) * 100 cat("- PEV Mean change:", sprintf("%+.2f%%", pev_mean_diff), "\n") cat("- PEV Max change:", sprintf("%+.2f%%", pev_max_diff), "\n") cat("- CD Mean change:", sprintf("%+.2f%%", cd_mean_diff), "\n") cat("\nRecommendations:\n") cat("- Use single-objective GA when you have a clear primary criterion\n") cat("- Use multi-objective GA when you need to balance multiple criteria\n") cat("- Multi-objective provides more diverse solution options\n") cat("- Single-objective typically converges faster for specific goals\n") ``` ## Advanced GA Features Demonstration ```{r advanced-features-demo} cat("Advanced GA Features Demonstration\n") cat("==================================\n") # Demonstrate convergence diagnostics diags <- convergence_diagnostics(ga_result, plot = FALSE) cat("Convergence Diagnostics:\n") cat("- Converged:", diags$convergence_status$converged, "\n") cat("- Convergence reason:", diags$convergence_status$reason, "\n") cat("- Fitness stability:", sprintf("%.8f", diags$metrics$fitness_stability), "\n") cat("- Diversity stability:", sprintf("%.6f", diags$metrics$diversity_stability), "\n") # Demonstrate parameter validation cat("\nParameter Validation Example:\n") validation <- validate_selection_parameters( selection_method = "rank", tournament_size = 100, # Too large selection_pressure = 3.0, # Out of range population_size = 50 ) cat("- Corrected tournament size:", validation$tournament_size, "\n") cat("- Corrected selection pressure:", validation$selection_pressure, "\n") cat("- Warnings issued:", length(validation$warnings), "\n") # Demonstrate different selection methods comparison cat("\nSelection Methods Performance:\n") methods <- c("tournament", "elite", "rank", "hybrid") method_performance <- data.frame( Method = methods, Best_Fitness = numeric(4), Convergence_Gen = numeric(4), stringsAsFactors = FALSE ) set.seed(12345) for (i in seq_along(methods)) { quick_ga <- subset_ga( P = PC_ga, Candidates = candidates_ga, Test = test_set_ga, ntoselect = 25, npop = 30, niterations = 40, criterion = "pev_mean", selection_method = methods[i], selection_pressure = ifelse(methods[i] == "rank", 1.4, 1.5), enable_restart = FALSE, verbose = FALSE ) method_performance$Best_Fitness[i] <- quick_ga$best_fitness method_performance$Convergence_Gen[i] <- quick_ga$convergence_generation } cat("Method | Best Fitness | Convergence Gen\n") cat("------------|--------------|----------------\n") for (i in 1:nrow(method_performance)) { cat(sprintf("%-11s | %12.6f | %14d\n", method_performance$Method[i], method_performance$Best_Fitness[i], method_performance$Convergence_Gen[i])) } ``` ## Further Reading For more details on the algorithms and theoretical background, see the individual function documentation and the package examples in `inst/examples/`. ```{r session-info} # Session information sessionInfo() ```