--- title: "STPGA Package Demo Report" author: "Deniz Akdemir" date: "`r Sys.Date()`" output: rmarkdown::html_vignette: toc: true toc_depth: 2 vignette: > %\VignetteIndexEntry{STPGA Package Demo Report} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE, eval = TRUE, warning = FALSE, message = FALSE) library(STPGA) ``` # STPGA Package v7.0.0 Demo This document demonstrates the modernized STPGA package functionality using the included WheatData dataset. ## Package Overview The STPGA package provides advanced genetic algorithms for optimal subset selection in genomic prediction and experimental design. Version 7.0.0 introduces significant enhancements including multi-criteria convergence detection, restart mechanisms, and enhanced selection methods. ### New Features in v7.0.0 - **Enhanced Convergence Detection**: Multi-criteria monitoring of fitness, diversity, and improvement rates - **Restart Mechanisms**: Automatic detection and recovery from premature convergence - **Advanced Selection Methods**: Rank-based selection with configurable pressure control - **Convergence Diagnostics**: Comprehensive analysis and optimization recommendations - **Parameter Validation**: Automatic correction of invalid parameters with user warnings For detailed coverage of these features, see the "Advanced Genetic Algorithm Features" vignette. ```{r package-info} # Package information cat("STPGA Package Version:", as.character(packageVersion("STPGA")), "\n") # Load the wheat dataset data("WheatData") cat("Dataset loaded:\n") cat("- Wheat.Y: Phenotype data (", nrow(Wheat.Y), " observations)\n") cat("- Wheat.M: Marker data (", nrow(Wheat.M), " individuals x ", ncol(Wheat.M), " markers)\n") cat("- Wheat.K: Kinship matrix (", nrow(Wheat.K), " x ", ncol(Wheat.K), ")\n") ``` ## Modern Criteria Functions The package now provides 8 clean, modern criteria functions: ```{r modern-criteria} # Setup data for demonstrations set.seed(123) all_individuals <- rownames(Wheat.M) test_set <- sample(all_individuals, 30) train_set <- sample(setdiff(all_individuals, test_set), 50) cat("Experimental setup:\n") cat("- Training set size:", length(train_set), "\n") cat("- Test set size:", length(test_set), "\n") # Use first 5 principal components for non-mixed model criteria # This is efficient for high-dimensional marker data cat("\nExtracting first 5 principal components...\n") pca_result <- prcomp(Wheat.M, center = TRUE, scale. = TRUE) Wheat.PC5 <- pca_result$x[, 1:5] rownames(Wheat.PC5) <- rownames(Wheat.M) cat("PC matrix dimensions:", nrow(Wheat.PC5), "x", ncol(Wheat.PC5), "\n") cat("Variance explained by first 5 PCs:", sprintf("%.2f%%", 100 * sum(pca_result$sdev[1:5]^2) / sum(pca_result$sdev^2)), "\n") # Test modern criteria using PCA cat("\nModern Criteria Results (using first 5 PCs):\n") cat("============================================\n") aopt <- a_optimality(train_set, test_set, Wheat.PC5) cat("A-optimality:", sprintf("%.2e", aopt), "\n") dopt <- d_optimality(train_set, test_set, Wheat.PC5) cat("D-optimality:", sprintf("%.2f", dopt), "\n") pev_mean_val <- pev_mean(train_set, test_set, Wheat.PC5, normalized = TRUE) cat("PEV Mean (normalized):", sprintf("%.6f", pev_mean_val), "\n") pev_max_val <- pev_max(train_set, test_set, Wheat.PC5, normalized = TRUE) cat("PEV Max (normalized):", sprintf("%.6f", pev_max_val), "\n") cd_mean_val <- cd_mean(train_set, test_set, Wheat.PC5, normalized = TRUE) cat("CD Mean (normalized):", sprintf("%.6f", cd_mean_val), "\n") ``` ## Training Set Size Analysis ```{r size-analysis} # Analyze effect of training set size using PCA sizes <- c(20, 40, 60, 80, 100) pev_values <- numeric(length(sizes)) cat("Training Set Size Analysis (using first 5 PCs):\n") cat("================================================\n") for (i in seq_along(sizes)) { train_size <- sample(setdiff(all_individuals, test_set), sizes[i]) pev_values[i] <- pev_mean(train_size, test_set, Wheat.PC5, normalized = TRUE) } # Create results table results_df <- data.frame( Size = sizes, PEV_Normalized = pev_values, Improvement = pev_values[1] / pev_values ) print(results_df) ``` ## Mixed Model Criteria ```{r mixed-models} # Test mixed model criteria with smaller subset for speed train_mm <- sample(setdiff(all_individuals, test_set), 30) test_mm <- sample(test_set, 15) # Use subset of markers for computational efficiency P_subset <- Wheat.M[c(train_mm, test_mm), 1:100] K_subset <- Wheat.K[c(train_mm, test_mm), c(train_mm, test_mm)] cat("Mixed Model Criteria (genomic selection):\n") cat("========================================\n") # Different heritability scenarios h2_scenarios <- list( high = list(Vg = 0.8, Ve = 0.2, name = "High h² (0.8)"), medium = list(Vg = 0.5, Ve = 0.5, name = "Medium h² (0.5)"), low = list(Vg = 0.2, Ve = 0.8, name = "Low h² (0.2)") ) cat("PEV by heritability:\n") for (scenario in h2_scenarios) { pev_mm <- pev_mean_mm(train_mm, test_mm, P_subset, K_subset, Vg = scenario$Vg, Ve = scenario$Ve) cat(sprintf(" %-15s: %.6f\n", scenario$name, pev_mm)) } ``` ## Unified Interface ```{r unified-interface} # Demonstrate the unified criterion() function using PCA cat("Unified Criterion Interface (using first 5 PCs):\n") cat("================================================\n") # Modern criteria names modern_criteria <- c("a_optimality", "d_optimality", "pev_mean", "cd_mean") for (crit in modern_criteria) { value <- criterion(train_set, test_set, Wheat.PC5, criterion = crit) cat(sprintf("%-15s: %12.2e\n", crit, value)) } # Test legacy compatibility cat("\nLegacy Compatibility:\n") aopt_modern <- criterion(train_set, test_set, Wheat.PC5, criterion = "a_optimality") aopt_legacy <- criterion(train_set, test_set, Wheat.PC5, criterion = "AOPT") cat("Modern a_optimality:", sprintf("%.6e", aopt_modern), "\n") cat("Legacy AOPT: ", sprintf("%.6e", aopt_legacy), "\n") cat("Values match:", abs(aopt_modern - aopt_legacy) < 1e-10, "\n") ``` ## Genetic Algorithm Optimization ### Single-Objective GA ```{r basic-ga-demo} cat("Basic Genetic Algorithm Demonstration\n") cat("====================================\n") # Use a smaller subset for quick demonstration set.seed(999) demo_indices <- sample(1:nrow(Wheat.M), 100) M_demo <- Wheat.M[demo_indices, 1:30] # Create principal components for GA pca_demo <- prcomp(M_demo, center = TRUE, scale. = TRUE) PC_demo <- pca_demo$x[, 1:4] rownames(PC_demo) <- rownames(M_demo) # Define sets all_demo <- rownames(PC_demo) test_demo <- sample(all_demo, 20) candidates_demo <- setdiff(all_demo, test_demo) cat("Setup:\n") cat("- Total individuals:", length(all_demo), "\n") cat("- Test set:", length(test_demo), "\n") cat("- Candidates:", length(candidates_demo), "\n") # Run basic GA ga_basic <- subset_ga( P = PC_demo, Candidates = candidates_demo, Test = test_demo, ntoselect = 20, npop = 30, niterations = 50, criterion = "pev_mean", verbose = FALSE ) cat("\nGA Results:\n") cat("- Best fitness:", sprintf("%.6f", ga_basic$best_fitness), "\n") cat("- Convergence generation:", ga_basic$convergence_generation, "\n") cat("- Training set selected:", length(ga_basic$best_solution), "\n") # Compare with random selection random_training <- sample(candidates_demo, 20) random_fitness <- pev_mean(random_training, test_demo, PC_demo) cat("\nComparison with Random Selection:\n") cat("- GA fitness:", sprintf("%.6f", ga_basic$best_fitness), "\n") cat("- Random fitness:", sprintf("%.6f", random_fitness), "\n") improvement <- ((random_fitness - ga_basic$best_fitness) / random_fitness) * 100 cat("- GA improvement:", sprintf("%.1f%%", improvement), "\n") ``` ### Multi-Objective GA Example ```{r basic-mo-ga-demo} cat("Basic Multi-Objective GA Demonstration\n") cat("======================================\n") # Define simple multi-objective optimization criteria_basic <- c("pev_mean", "cd_mean") criteria_types_basic <- c(FALSE, TRUE) # FALSE = minimize, TRUE = maximize plot_directions_basic <- c(-1, 1) cat("Optimizing two criteria:\n") cat("1. PEV Mean (minimize) - prediction accuracy\n") cat("2. Cook's Distance Mean (maximize) - robustness\n") # Run multi-objective GA mo_basic <- subset_ga_multiobjective( Pcs = PC_demo, candidates = candidates_demo, test = test_demo, ntoselect = 20, criteria = criteria_basic, criteria_types = criteria_types_basic, plot_directions = plot_directions_basic, npop = 40, niterations = 30, mutprob = 0.8, plot_iterations = FALSE ) cat("\nMulti-Objective Results:\n") if (!is.null(mo_basic$archive)) { cat("- Pareto solutions found:", nrow(mo_basic$archive), "\n") } else { cat("- Pareto solutions found: 0\n") } cat("- Final population size:", length(mo_basic$population), "\n") # Show Pareto front archive_basic <- mo_basic$archive if (!is.null(archive_basic) && nrow(archive_basic) > 0) { cat("\nPareto Front Range:\n") cat("- PEV Mean: [", sprintf("%.4f", min(archive_basic[,1])), ", ", sprintf("%.4f", max(archive_basic[,1])), "]\n") cat("- CD Mean: [", sprintf("%.4f", min(archive_basic[,2])), ", ", sprintf("%.4f", max(archive_basic[,2])), "]\n") # Find best compromise solution distances_basic <- apply(archive_basic, 1, function(x) { norm_pev <- (x[1] - min(archive_basic[,1])) / (max(archive_basic[,1]) - min(archive_basic[,1])) norm_cd <- (max(archive_basic[,2]) - x[2]) / (max(archive_basic[,2]) - min(archive_basic[,2])) sqrt(norm_pev^2 + norm_cd^2) }) best_idx <- which.min(distances_basic) cat("\nBest Compromise Solution:\n") cat("- PEV Mean:", sprintf("%.6f", archive_basic[best_idx, 1]), "\n") cat("- CD Mean:", sprintf("%.6f", archive_basic[best_idx, 2]), "\n") } else { cat("\nNo Pareto solutions found in archive\n") # Create dummy data for plotting archive_basic <- matrix(c(0.5, 0.3), nrow = 1) best_idx <- 1 } ``` ```{r plot-ga-comparison, fig.width=10, fig.height=6} # Plot GA results comparison par(mfrow = c(1, 2)) # Single-objective convergence plot(1:nrow(ga_basic$fitness_history), ga_basic$fitness_history[, "best"], type = "l", col = "blue", lwd = 2, xlab = "Generation", ylab = "Best Fitness (PEV)", main = "Single-Objective GA Convergence") lines(1:nrow(ga_basic$fitness_history), ga_basic$fitness_history[, "mean"], col = "red", lty = 2) legend("topright", c("Best", "Mean"), col = c("blue", "red"), lty = c(1, 2)) grid(TRUE) # Multi-objective Pareto front if (!is.null(archive_basic) && nrow(archive_basic) > 0) { plot(archive_basic[,1], archive_basic[,2], pch = 19, col = "darkgreen", cex = 1.2, xlab = "PEV Mean (minimize)", ylab = "Cook's Distance Mean (maximize)", main = "Multi-Objective Pareto Front") points(archive_basic[best_idx, 1], archive_basic[best_idx, 2], pch = 17, col = "red", cex = 2) legend("topright", c("Pareto Solutions", "Best Compromise"), pch = c(19, 17), col = c("darkgreen", "red")) } else { plot(0.5, 0.3, pch = 19, col = "darkgreen", cex = 1.2, xlab = "PEV Mean (minimize)", ylab = "Cook's Distance Mean (maximize)", main = "Multi-Objective Pareto Front (No Solutions)") text(0.5, 0.25, "No Pareto solutions found", cex = 1.2) } grid(TRUE) par(mfrow = c(1, 1)) ``` ## Summary The modernized STPGA package v7.0.0 provides: - **Clean API**: 8 intuitive function names instead of 18+ confusing ones - **Comprehensive Testing**: All functions tested and verified - **Mixed Model Support**: For genomic selection applications - **Numerical Stability**: Ridge regularization and robust matrix operations - **Legacy Compatibility**: Old function names still work The package successfully optimizes training set selection for genomic prediction using the wheat dataset, showing significant improvements over random selection. ## Session Information ```{r session-info} sessionInfo() ```