## R as a Research Tool
## HW 06: simple evolution simulation
## Solutions
## Instructions
## ------------
## This file contains several complete functions you've seen before:
## mateNumericVectors() and testFitness(). In addition, I've written a new
## function called simpleEvolve(), which creates a haploid population, creates
## a random optimal genotype, and then simulates evolution over a set number of
## generations. This new function depends upon two other functions. One simple
## one I've supplied (getRandChrom), but the other you will write yourself:
## getNextGeneration.
## Your tasks:
## 1. Write the getNextGeneration function (see comment header I supply). Hint:
## look up the "prob" argument to the sample function to see how to choose
## among a list or vector according to weighted probabilities (fitnesses).
## 2. Comment all of the new code (eg everything except mateNumericVectors()
## and testFitness().
## 3. Write code at the end of the script to search through a reasonable number
## of values of mu and r and report which values result in the most rapid
## convergence of the population mean fitness to > 0.9. You will need to
## extract the num.generations element from the result of each run.
###############################################################################
## simple-evolve.R:
## A simple simulation of haploid evolution.
## global constants:
ALLELES <- 1:10 # possible allele values
CHROM_LENGTH <- 10 # length of "chomosome" vectors
POP_SIZE <- 250 # population size
FITNESS_CRITERION <- 0.9 # default mean population fitness at which to end
# simulation
getRandChrom <- function(lngth=CHROM_LENGTH){
return(sample(ALLELES, lngth, replace=TRUE))
}
# mateNumericVectors(x, y, r, mu=0):
# Returns a child vector created by "mating" two parent vectors x and y and
# supplying a per-adjacent-locus recombination rate and a per-locus mutation
# rate. During mutation, an allele is randomly assigned a value between 1 and
# 100, inclusive.
#
# Args:
# - x, y = Two vectors each representing a "genotype". The vectors should be
# atomic vectors containing integer values between 1 and 100
# inclusive. These vectors are the two parent genotypes and must be
# of the same length.
# - r = The per-locus recombination rate. A rate between 0 and 1 which
# indicates the probability of a crossover event between any two
# adjacent genes (vector elements). Rates between 0 and 0.5 make the
# most sense biologically.
# - mu The per-locus mutation rate.
#
# Returns:
# - A vector of the same length as the parents
mateNumericVectors <- function(x, y, r, mu) {
n <- length(x)
stopifnot(n==length(y))
crossovers <- runif(n) <= r
useFirstParent <- cumsum(crossovers) %% 2 == 0
if(runif(1) < 0.5) useFirstParent <- !useFirstParent
child <- ifelse(useFirstParent,x,y)
mutations <- runif(n) <= mu
child[mutations] <- sample(ALLELES, sum(mutations), replace=TRUE)
return(child)
}
# testFitness(genotype, targetGenotype):
# Returns a scalar number representing how well the genotype (a vector of
# numbers) matches the target genotype. The matching is measured as the
# recipricol sum of the squared deviations from the target (1 / (sse+1) ).
#
# Args:
# - genotype = the chromosome to be tested.
# - targetGenotype = The optimum genotype. These must be numeric
# vectors of the same length.
#
# Returns:
# - A scalar: the reciprocal of the sum of the squared deviations
# of the genotype from the target (1 / (sse+1) ).
testFitness <- function(genotype, targetGenotype){
stopifnot(length(genotype)==length(targetGenotype))
return(1.0 / (sum((genotype-targetGenotype)^2) + 1.0 ) )
}
# getNextGeneration(population, r, mu, fitnesses):
#
# Accepts a list of genotypes ("population") and returns the new population
# replacing each individual with the result of mating two vectors chosen
# according to their fitnesses. Note that these are non-overlapping
# generations: the parents should be chosen from the existing generation and
# not from any newly selected offspring. The basic aglorithm: 1) cycle through
# population. 2) for each individual, select a mother and father. These are
# chosen with weighted probability based on their fitnesses (see fitnesses
# argument). 3) Add the offspring to the new population 4) return the new
# population.
#
# Args:
# - population = a list of chromosomes (a list of numeric vectors)
# - r = per-adjacent-locus reocmbination rate
# - mu = a per locus mutation rate
# - fitnesses = a vector of real numbers corresponding to the fitness
# values of each individual in the populations. This must
# be of the same length as population.
#
# Returns:
# - A list of vectors: the new population
getNextGeneration <- function(population, r, mu, fitnesses) {
N = length(population)
parents <- sample(population, N*2, prob=fitnesses, replace = TRUE)
new.pop <- mapply(mateNumericVectors, parents[1:N], parents[(N+1):(N*2)],
r, mu, SIMPLIFY=FALSE)
return(new.pop)
}
## Alternative implementation with loops (slower because of all of the separate
## calls to sample() )
getNextGeneration.loop <- function(population, r, mu, fitnesses) {
# make empty list of correct length for results
new.pop <- vector("list", length=length(population))
# cycle through population and make offspring
for (i in 1:length(population)) {
parents <- sample(population, 2, prob=fitnesses)
new.pop[[i]] <- mateNumericVectors(parents[[1]], parents[[2]], r, mu)
}
return(new.pop)
}
# simpleEvolve(generations, r, mu, break.fit = FITNESS_CRITERION):
#
# This function simulates evolution among haploid individuals. The population
# size is determined by the global constant, POP_SIZE, the chromosome length is
# determined by the global variable CHROM_LENGTH (used in getRandChrom(). Thus
# function accepts arguments controlling the maximum number of generations to
# run, recombination rate, mutation rate and the population mean fitness value
# (0-1) that is the threshold for terminating the simulation. The function
# simulates non-overlappig generations and fitness is aplied at the parent
# selection stage. In other words, all individuals die each generation and are
# replaced with offspring whose parents where selected randomly but weighted by
# the fitnesess of all potential parents.
#
# Args:
# - generations = a scaler integer. The maximum number of generations
# to run
# - r = per-adjacent-locus recombination rate
# - mu = a per locus mutation rate
# - break.fit = a threshold value for the population mean fitness.
# When the mean fitness of all individuals in the
# population meets or exceeds this value, the
# simulation temrinates.
#
# Returns:
# - A list with the following components: 1) "optimal.genotype": the
# target genotype that defined perfect fitness for that run, 2)
# "mean.fit": the mean fitness of the population at simulation end, and
# 3) "num.generations": the number of generations it took to reach that
# mean fitness. The maximum vlaue is determined by the "generations"
# argument.
simpleEvolve <- function(generations, r, mu, break.fit = FITNESS_CRITERION,
do.hist=FALSE) {
optimum <- getRandChrom()
population <- replicate(POP_SIZE, getRandChrom(), simplify=FALSE)
for(gen in 1:generations){
allfits <- sapply(population, testFitness, targetGenotype=optimum)
meanfit <- mean(allfits)
if(do.hist) {
hist(sqrt(allfits),
freq=FALSE, breaks=seq(0,1,0.02),col="gray",
xlab="Square root of fitness")
}
if ( meanfit > break.fit) break
population <- getNextGeneration(population, r, mu, allfits)
}
return(list("optimal.genotype" = optimum, "mean.fit"=meanfit, "num.generations"=gen))
}
## Now search parameter space
searchParams <- function(max.generations, rs, mus, nreps=1) {
df <- data.frame(r = vector("numeric", 0),
mu = vector("numeric", 0),
ngen = vector("numeric", 0))
for(r in rs) {
for(mu in mus) {
for (rep in 1:nreps) {
run <- simpleEvolve(max.generations, r, mu)
print(sprintf("Rep %d, r=%.3f, mu=%.3f, ngen=%d",
rep, r, mu, run$num.generations))
df <- rbind(df, data.frame(r=r, mu=mu, ngen=run$num.generations))
}
}
}
return(df)
}
### examples and parameter search
library(dplyr)
library(ggplot2)
library(rgl)
## simpleEvolve examples with moving histogram:
simpleEvolve(450, r=.01, mu=0.001, do.hist=TRUE)
## simpleEvolve(500, r=0.5, mu=0.001, do.hist= TRUE)
## simpleEvolve(500, r=0.5, mu=0.001, break.fit=0.95)
## Search below is time consuming, so only run if needed.
#d <- searchParams(400, seq(0.0, 0.3, 0.02), seq(0.0, 0.03, 0.002), nreps=5)
#write.csv(d, "param_search.csv", row.names=FALSE)
## or read from file:
#evolve.params <- read.csv("./param_search.csv")
evolve.params <- read.csv("http://r-research-tool.schwilk.org/assignments/param_search.csv")
## lowest values
subset(evolve.params, ngen < 20)
mean(subset(evolve.params, ngen < 30)$mu)
## data with mean over 5 reps:
mean.ngen <- evolve.params %>% group_by(r,mu) %>% summarize(ngen=mean(ngen))
length(mean.ngen$ngen) # 265 parameter combinations
## 2d plots
p1 <- ggplot(mean.ngen, aes(r, ngen, color=mu))
p1 + geom_point() + geom_jitter()
p2 <- ggplot(mean.ngen, aes(mu, ngen, color=r))
p2 <- p2 + geom_point() + geom_jitter()
p2
## try faceting
mean.ngen$fr <- cut(mean.ngen$r, 5)
p3 <- ggplot(mean.ngen, aes(mu, ngen, color=r))
p3 <- p3 + geom_point() + geom_jitter()
p3 + facet_grid(fr ~ .)
## contour plots
p4 <- ggplot(mean.ngen, aes(x=r, y=mu, z = ngen))
p4 + stat_contour(bins=9)
p4 + stat_contour(bins=9) + xlim(c(0,0.2))
## 3d graph
library(lattice)
elev.loess <- loess(ngen ~ r*mu, span=0.2,data=evolve.params)
mean.ngen$z <- predict(elev.loess, mean.ngen)
wireframe(z ~ r*mu, data=mean.ngen, zlab="N gen",
drape =TRUE, colorkey=TRUE,screen = list(z=-75, x=-70))
# or, in rgl (interactive!)
plot3d(mean.ngen$r, mean.ngen$mu, mean.ngen$ngen,
xlab = "r", ylab = "mu", zlab = "ngen", size=1,
col=rainbow(length(mean.ngen$ngen))[rank(mean.ngen$ngen)],
expand=1, type="s")
## So, r increases convergence speed monotonically, but there is an optimum
## value of mu around 0.006 -- 0.01