Homework 06 Solutions

## R as a Research Tool HW 06: simple evolution simulation solutions

## Instructions
## ------------

## This file contains several complete functions you've seen before:
## mateNumericVectors() and testFitness(). In addition, I've written a new
## function called simpleEvolve(), which creates a haploid population, creates
## a random optimal genotype, and then simulates evolution over a set number of
## generations. This new function depends upon two other functions. One simple
## one I've supplied (getRandChrom), but the other you will write yourself:
## getNextGeneration.

## Your tasks:

## 1. Write the getNextGeneration function (see comment header I supply). Hint:
## look up the "prob" argument to the sample function to see how to choose
## among a list or vector according to weighted probabilities (fitnesses).

## 2. Comment all of the new code (eg everything except mateNumericVectors()
## and testFitness().

## 3. Write code at the end of the script to search through a reasonable number
## of values of mu and r and report which values result in the most rapid
## convergence of the population mean fitness to > 0.9. You will need to
## extract the num.generations element from the result of each run.


###############################################################################
## simple-evolve.R:
## A simple simulation of haploid evolution.

## global constants:
ALLELES <- 1:10           # possible allele values
CHROM_LENGTH <- 10        # length of "chomosome" vectors
POP_SIZE <- 100           # population size
FITNESS_CRITERION <- 0.9  # default mean population fitness at which to end
                          # simulation


getRandChrom <- function(lngth=CHROM_LENGTH){
    return(sample(ALLELES, lngth, replace=TRUE))
}

# mateNumericVectors(x, y, r, mu=0):
# Returns a child vector created by "mating" two parent vectors x and y and
# supplying a per-adjacent-locus recombination rate and a per-locus mutation
# rate. During mutation, an allele is randomly assigned a value between 1 and
# 100, inclusive.
#
#   Args:
#    - x, y = Two vectors each representing a "genotype". The vectors should be
#             atomic vectors containing integer values between 1 and 100
#             inclusive. These vectors are the two parent genotypes and must be
#             of the same length.
#    - r    = The per-locus recombination rate. A rate between 0 and 1 which
#             indicates the probability of a crossover event between any two
#             adjacent genes (vector elements).  Rates between 0 and 0.5 make the 
#             most sense biologically.
#   - mu      The per-locus mutation rate.
#
#  Returns:
#         - A vector of the same length as the parents
mateNumericVectors <- function(x, y, r, mu) {
    n <- length(x)
    stopifnot(n==length(y))

    crossovers <- runif(n) <= r
    useFirstParent  <- cumsum(crossovers) %% 2 == 0

    if(runif(1) < 0.5) useFirstParent <- !useFirstParent
    child <- ifelse(useFirstParent,x,y)

    mutations <- runif(n) <= mu
    child[mutations] <- sample(ALLELES, sum(mutations), replace=TRUE)

  return(child)
}

# testFitness(genotype, targetGenotype):
# Returns a scalar number representing how well the genotype (a vector of
# numbers) matches the target genotype. The matching is measured as the 
# recipricol sum of the squared deviations from the target (1 / (sse+1) ).
#
#     Args:
#         - genotype       = the chromosome to be tested.
#         - targetGenotype = The optimum genotype. These must be numeric
#                            vectors of the same length.
#
#  Returns:
#         - A scalar: the reciprocal of the sum of the squared deviations 
#           of the genotype from the target  (1 / (sse+1) ).
testFitness <- function(genotype, targetGenotype){
    stopifnot(length(genotype)==length(targetGenotype))
    return(1.0 / (sum((genotype-targetGenotype)^2) + 1.0 ) )
}


# getNextGeneration(population, r, mu, fitnesses):
#
# Accepts a list of genotypes ("population") and returns the new population
# replacing each individual with the result of mating two vectors chosen
# according to their fitnesses. Note that these are non-overlapping
# generations: the parents should be chosen from the existing generation and
# not from any newly selected offspring. The basic algorithm: 1) for each spot
# in the new population select two parents from the current population. These
# are chosen with weighted probability based on their fitnesses (see fitnesses
# argument). 2) Use mateVectors to produce an offspring and add the offspring
# to the new population 4) When we have a new population of the same size as
# the original, return the new population.
#
#   Args:
#     - population     = a list of chromosomes (a list of numeric vectors)
#     - r              = per-adjacent-locus reocmbination rate
#     - mu             = a per locus mutation rate
#     - fitnesses = a vector of real numbers corresponding to the fitness
#                   values of each individual in the populations. This must
#                   be of the same length as population.
#
#  Returns:
#         - A list of vectors: the new population
getNextGeneration <- function(population, r, mu, fitnesses) {
    N = length(population)
    parents <- sample(population, N*2, prob=fitnesses, replace = TRUE)
    new_pop <- mapply(mateNumericVectors, parents[1:N], parents[(N+1):(N*2)],
                      r, mu, SIMPLIFY=FALSE)
    return(new_pop)
}

## Alternative implementation with loops (slower because of all of the separate
## calls to sample() )
getNextGeneration.loop <- function(population, r, mu, fitnesses) {
    # make empty list of correct length for results
    new.pop <- vector("list", length=length(population))
    # cycle through population and make offspring
    for (i in 1:length(population)) {
        parents <- sample(population, 2, prob=fitnesses, replace=TRUE) # allow selfing
        new.pop[[i]] <- mateNumericVectors(parents[[1]], parents[[2]], r, mu)
    }
    return(new.pop)
}

## DWS it would also be possible to use a loop but first get all of the
## potential parents as in the first version.


# simpleEvolve(generations, r, mu, break.fit = FITNESS_CRITERION):
#
# This function simulates evolution among haploid individuals. The population
# size is determined by the global constant, POP_SIZE, the chromosome length is
# determined by the global variable CHROM_LENGTH (used in getRandChrom(). Thus
# function accepts arguments controlling the maximum number of generations to
# run, recombination rate, mutation rate and the population mean fitness value
# (0-1) that is the threshold for terminating the simulation. The function
# simulates non-overlappig generations and fitness is aplied at the parent
# selection stage. In other words, all individuals die each generation and are
# replaced with offspring whose parents where selected randomly but weighted by
# the fitnesess of all potential parents.
#
#   Args:
#     - generations    = a scaler integer. The maximum number of generations
#                        to run
#     - r              = per-adjacent-locus recombination rate
#     - mu             = a per locus mutation rate
#     - break.fit      = a threshold value for the population mean fitness.
#                        When the mean fitness of all individuals in the
#                        population meets or exceeds this value, the
#                        simulation temrinates.
#     - do.hist =        flag to set plotting output side effect. Shows moving
#                         histogram of population fitnesses when do.hist=TRUE
#
#  Returns:
#         - A list with the following components: 1) "optimal.genotype": the
#         target genotype that defined perfect fitness for that run, 2)
#         "mean.fit": the mean fitness of the population at simulation end, and
#         3) "num.generations": the number of generations it took to reach that
#         mean fitness. The maximum vlaue is determined by the "generations"
#         argument.
simpleEvolve <- function(generations, r, mu, break.fit = FITNESS_CRITERION,
                         do.hist=FALSE) {
    optimum <- getRandChrom()
    population <- replicate(POP_SIZE, getRandChrom(), simplify=FALSE)

    for(gen in 1:generations){
        allfits <- sapply(population, testFitness, targetGenotype=optimum)
        meanfit <- mean(allfits)

        if(do.hist) {
            hist(sqrt(allfits),
                 freq=FALSE, breaks=seq(0,1,0.02),col="gray",
                 xlab="Square root of fitness")
        }

        if ( meanfit > break.fit) break
        population <- getNextGeneration(population, r, mu, allfits)
    }
  return(list("optimal.genotype" = optimum, "mean.fit"=meanfit, "num.generations"=gen))
 }


###############################################################################
## Part 3
##############################################################################


## search parameter space
searchParams <- function(max.generations, rs, mus, nreps=1) {
    df <- data.frame(r = vector("numeric", 0),
                     mu = vector("numeric", 0),
                     ngen =  vector("numeric", 0))
    for(r in rs) {
        for(mu in mus) {
            for (rep in 1:nreps) {
                run <- simpleEvolve(max.generations, r, mu)
                print(sprintf("Rep %d, r=%.3f, mu=%.3f, ngen=%d",
                              rep, r, mu, run$num.generations))
                df <- rbind(df, data.frame(r=r, mu=mu, ngen=run$num.generations))
            }
        }
    }
    return(df)
}


### examples and parameter search
library(dplyr)
library(ggplot2)
library(rgl)


## simpleEvolve examples with moving histogram:
simpleEvolve(450, r=.01, mu=0.001, do.hist=TRUE)
## simpleEvolve(500, r=0.5, mu=0.001, do.hist= TRUE)
## simpleEvolve(500, r=0.5, mu=0.001, break.fit=0.95)

## Search below is time consuming, so only run if needed.
# d <- searchParams(400, seq(0.0, 0.3, 0.02), seq(0.0, 0.03, 0.002), nreps=5)
## Write to file (just do this once)
#write.csv(d, "param_search.csv", row.names=FALSE)

## or read from file to save time since I already ran this :).
evolve.params <- read.csv("http://r-research-tool.schwilk.org/assignments/param_search.csv")

## lowest values
subset(evolve.params, ngen < 30)
mean(subset(evolve.params, ngen < 30)$mu)

## data with mean over 5 reps:
mean.ngen <- evolve.params %>% group_by(r,mu) %>% summarize(ngen=mean(ngen))
length(mean.ngen$ngen) # 256 parameter combinations

## 2d plots
p1 <- ggplot(mean.ngen, aes(r, ngen, color=mu))
p1 + geom_point() + geom_jitter()

p2 <- ggplot(mean.ngen, aes(mu, ngen, color=r))
p2 <- p2 +  geom_point() + geom_jitter()
p2

## try faceting
mean.ngen$fr <- cut(mean.ngen$r, 5)
p3 <- ggplot(mean.ngen, aes(mu, ngen, color=r))
p3 <- p3 +  geom_point() + geom_jitter()
p3 + facet_grid(fr ~ .)


## contour plots
p4 <- ggplot(mean.ngen, aes(x=r, y=mu, z = ngen))
p4 + stat_contour(bins=9)

p4 + stat_contour(bins=9) + xlim(c(0,0.2))


## 3d graph
library(lattice)
elev.loess <- loess(ngen ~ r*mu, span=0.2,data=evolve.params)
mean.ngen$z <- predict(elev.loess, mean.ngen)
wireframe(z ~ r*mu, data=mean.ngen, zlab="N gen",
          drape =TRUE, colorkey=TRUE,screen = list(z=-75, x=-70))

# or, in rgl (interactive!)
plot3d(mean.ngen$r, mean.ngen$mu, mean.ngen$ngen,
       xlab = "r", ylab = "mu", zlab = "ngen", size=1,
       col=rainbow(length(mean.ngen$ngen))[rank(mean.ngen$ngen)],
       expand=1, type="s")

## So, r increases convergence speed monotonically, but there is an optimum
## value of mu around 0.006 -- 0.01

Back to top | E-mail Schwilk