# Homework 06 Solutions

## R as a Research Tool HW 06: simple evolution simulation solutions ## Instructions ## ------------ ## This file contains several complete functions you've seen before: ## mateNumericVectors() and testFitness(). In addition, I've written a new ## function called simpleEvolve(), which creates a haploid population, creates ## a random optimal genotype, and then simulates evolution over a set number of ## generations. This new function depends upon two other functions. One simple ## one I've supplied (getRandChrom), but the other you will write yourself: ## getNextGeneration. ## Your tasks: ## 1. Write the getNextGeneration function (see comment header I supply). Hint: ## look up the "prob" argument to the sample function to see how to choose ## among a list or vector according to weighted probabilities (fitnesses). ## 2. Comment all of the new code (eg everything except mateNumericVectors() ## and testFitness(). ## 3. Write code at the end of the script to search through a reasonable number ## of values of mu and r and report which values result in the most rapid ## convergence of the population mean fitness to > 0.9. You will need to ## extract the num.generations element from the result of each run. ############################################################################### ## simple-evolve.R: ## A simple simulation of haploid evolution. ## global constants: ALLELES <- 1:10 # possible allele values CHROM_LENGTH <- 10 # length of "chomosome" vectors POP_SIZE <- 100 # population size FITNESS_CRITERION <- 0.9 # default mean population fitness at which to end # simulation getRandChrom <- function(lngth=CHROM_LENGTH){ return(sample(ALLELES, lngth, replace=TRUE)) } # mateNumericVectors(x, y, r, mu=0): # Returns a child vector created by "mating" two parent vectors x and y and # supplying a per-adjacent-locus recombination rate and a per-locus mutation # rate. During mutation, an allele is randomly assigned a value between 1 and # 100, inclusive. # # Args: # - x, y = Two vectors each representing a "genotype". The vectors should be # atomic vectors containing integer values between 1 and 100 # inclusive. These vectors are the two parent genotypes and must be # of the same length. # - r = The per-locus recombination rate. A rate between 0 and 1 which # indicates the probability of a crossover event between any two # adjacent genes (vector elements). Rates between 0 and 0.5 make the # most sense biologically. # - mu The per-locus mutation rate. # # Returns: # - A vector of the same length as the parents mateNumericVectors <- function(x, y, r, mu) { n <- length(x) stopifnot(n==length(y)) crossovers <- runif(n) <= r useFirstParent <- cumsum(crossovers) %% 2 == 0 if(runif(1) < 0.5) useFirstParent <- !useFirstParent child <- ifelse(useFirstParent,x,y) mutations <- runif(n) <= mu child[mutations] <- sample(ALLELES, sum(mutations), replace=TRUE) return(child) } # testFitness(genotype, targetGenotype): # Returns a scalar number representing how well the genotype (a vector of # numbers) matches the target genotype. The matching is measured as the # recipricol sum of the squared deviations from the target (1 / (sse+1) ). # # Args: # - genotype = the chromosome to be tested. # - targetGenotype = The optimum genotype. These must be numeric # vectors of the same length. # # Returns: # - A scalar: the reciprocal of the sum of the squared deviations # of the genotype from the target (1 / (sse+1) ). testFitness <- function(genotype, targetGenotype){ stopifnot(length(genotype)==length(targetGenotype)) return(1.0 / (sum((genotype-targetGenotype)^2) + 1.0 ) ) } # getNextGeneration(population, r, mu, fitnesses): # # Accepts a list of genotypes ("population") and returns the new population # replacing each individual with the result of mating two vectors chosen # according to their fitnesses. Note that these are non-overlapping # generations: the parents should be chosen from the existing generation and # not from any newly selected offspring. The basic algorithm: 1) for each spot # in the new population select two parents from the current population. These # are chosen with weighted probability based on their fitnesses (see fitnesses # argument). 2) Use mateVectors to produce an offspring and add the offspring # to the new population 4) When we have a new population of the same size as # the original, return the new population. # # Args: # - population = a list of chromosomes (a list of numeric vectors) # - r = per-adjacent-locus reocmbination rate # - mu = a per locus mutation rate # - fitnesses = a vector of real numbers corresponding to the fitness # values of each individual in the populations. This must # be of the same length as population. # # Returns: # - A list of vectors: the new population getNextGeneration <- function(population, r, mu, fitnesses) { N = length(population) parents <- sample(population, N*2, prob=fitnesses, replace = TRUE) new_pop <- mapply(mateNumericVectors, parents[1:N], parents[(N+1):(N*2)], r, mu, SIMPLIFY=FALSE) return(new_pop) } ## Alternative implementation with loops (slower because of all of the separate ## calls to sample() ) getNextGeneration.loop <- function(population, r, mu, fitnesses) { # make empty list of correct length for results new.pop <- vector("list", length=length(population)) # cycle through population and make offspring for (i in 1:length(population)) { parents <- sample(population, 2, prob=fitnesses, replace=TRUE) # allow selfing new.pop[[i]] <- mateNumericVectors(parents[[1]], parents[[2]], r, mu) } return(new.pop) } ## DWS it would also be possible to use a loop but first get all of the ## potential parents as in the first version. # simpleEvolve(generations, r, mu, break.fit = FITNESS_CRITERION): # # This function simulates evolution among haploid individuals. The population # size is determined by the global constant, POP_SIZE, the chromosome length is # determined by the global variable CHROM_LENGTH (used in getRandChrom(). Thus # function accepts arguments controlling the maximum number of generations to # run, recombination rate, mutation rate and the population mean fitness value # (0-1) that is the threshold for terminating the simulation. The function # simulates non-overlappig generations and fitness is aplied at the parent # selection stage. In other words, all individuals die each generation and are # replaced with offspring whose parents where selected randomly but weighted by # the fitnesess of all potential parents. # # Args: # - generations = a scaler integer. The maximum number of generations # to run # - r = per-adjacent-locus recombination rate # - mu = a per locus mutation rate # - break.fit = a threshold value for the population mean fitness. # When the mean fitness of all individuals in the # population meets or exceeds this value, the # simulation temrinates. # - do.hist = flag to set plotting output side effect. Shows moving # histogram of population fitnesses when do.hist=TRUE # # Returns: # - A list with the following components: 1) "optimal.genotype": the # target genotype that defined perfect fitness for that run, 2) # "mean.fit": the mean fitness of the population at simulation end, and # 3) "num.generations": the number of generations it took to reach that # mean fitness. The maximum vlaue is determined by the "generations" # argument. simpleEvolve <- function(generations, r, mu, break.fit = FITNESS_CRITERION, do.hist=FALSE) { optimum <- getRandChrom() population <- replicate(POP_SIZE, getRandChrom(), simplify=FALSE) for(gen in 1:generations){ allfits <- sapply(population, testFitness, targetGenotype=optimum) meanfit <- mean(allfits) if(do.hist) { hist(sqrt(allfits), freq=FALSE, breaks=seq(0,1,0.02),col="gray", xlab="Square root of fitness") } if ( meanfit > break.fit) break population <- getNextGeneration(population, r, mu, allfits) } return(list("optimal.genotype" = optimum, "mean.fit"=meanfit, "num.generations"=gen)) } ############################################################################### ## Part 3 ############################################################################## ## search parameter space searchParams <- function(max.generations, rs, mus, nreps=1) { df <- data.frame(r = vector("numeric", 0), mu = vector("numeric", 0), ngen = vector("numeric", 0)) for(r in rs) { for(mu in mus) { for (rep in 1:nreps) { run <- simpleEvolve(max.generations, r, mu) print(sprintf("Rep %d, r=%.3f, mu=%.3f, ngen=%d", rep, r, mu, run$num.generations)) df <- rbind(df, data.frame(r=r, mu=mu, ngen=run$num.generations)) } } } return(df) } ### examples and parameter search library(dplyr) library(ggplot2) library(rgl) ## simpleEvolve examples with moving histogram: simpleEvolve(450, r=.01, mu=0.001, do.hist=TRUE) ## simpleEvolve(500, r=0.5, mu=0.001, do.hist= TRUE) ## simpleEvolve(500, r=0.5, mu=0.001, break.fit=0.95) ## Search below is time consuming, so only run if needed. # d <- searchParams(400, seq(0.0, 0.3, 0.02), seq(0.0, 0.03, 0.002), nreps=5) ## Write to file (just do this once) #write.csv(d, "param_search.csv", row.names=FALSE) ## or read from file to save time since I already ran this :). evolve.params <- read.csv("http://r-research-tool.schwilk.org/assignments/param_search.csv") ## lowest values subset(evolve.params, ngen < 30) mean(subset(evolve.params, ngen < 30)$mu) ## data with mean over 5 reps: mean.ngen <- evolve.params %>% group_by(r,mu) %>% summarize(ngen=mean(ngen)) length(mean.ngen$ngen) # 256 parameter combinations ## 2d plots p1 <- ggplot(mean.ngen, aes(r, ngen, color=mu)) p1 + geom_point() + geom_jitter() p2 <- ggplot(mean.ngen, aes(mu, ngen, color=r)) p2 <- p2 + geom_point() + geom_jitter() p2 ## try faceting mean.ngen$fr <- cut(mean.ngen$r, 5) p3 <- ggplot(mean.ngen, aes(mu, ngen, color=r)) p3 <- p3 + geom_point() + geom_jitter() p3 + facet_grid(fr ~ .) ## contour plots p4 <- ggplot(mean.ngen, aes(x=r, y=mu, z = ngen)) p4 + stat_contour(bins=9) p4 + stat_contour(bins=9) + xlim(c(0,0.2)) ## 3d graph library(lattice) elev.loess <- loess(ngen ~ r*mu, span=0.2,data=evolve.params) mean.ngen$z <- predict(elev.loess, mean.ngen) wireframe(z ~ r*mu, data=mean.ngen, zlab="N gen", drape =TRUE, colorkey=TRUE,screen = list(z=-75, x=-70)) # or, in rgl (interactive!) plot3d(mean.ngen$r, mean.ngen$mu, mean.ngen$ngen, xlab = "r", ylab = "mu", zlab = "ngen", size=1, col=rainbow(length(mean.ngen$ngen))[rank(mean.ngen$ngen)], expand=1, type="s") ## So, r increases convergence speed monotonically, but there is an optimum ## value of mu around 0.006 -- 0.01