This analysis serves to do three things:

  • Shift the Liu et al (2014) mouse genetic map so that 0 cM corresponds to 0 Mbp.

  • Calculate a grid of markers across the genome.

  • Get interpolated cM positions for the markers on the GigaMUGA array.

I’ll use the R/qtl2scan package to do interpolations, R/qtl2convert to do some data manipulation, and R/broman for a few minor things. I’ll also use a few other packages below (rvest, AnnotationHub, and devtools).

library(qtl2geno)
library(qtl2scan)
library(qtl2convert)
library(broman)

Download and prepare the maps

I first need to download the Liu et al (2014) maps and do a bit of preparations.

I’ll first ensure that I’ve got directories to contain the files I download plus those I create.

in_dir <- "files"
out_dir <- "results"
for(dir in c(in_dir, out_dir)) {
    if(!dir.exists(dir)) dir.create(dir)
}

Now I’ll download and unzip the Liu et al (2014) map.

url <- "http://cgd.jax.org/mousemapconverter/G2F1.anchor.maps.zip"
file <- file.path(in_dir, basename(url))
if(!file.exists(file)) {
    download.file(url, file)
}
unzip(file, exdir=in_dir)

I’ll read in the sex-averaged, female, and male genetic maps. I’m also going to give each row a name that is like 1_3036178 for chromosome 1 at 3,036,178 bp.

files <- c("avg.map.csv", "female.map.csv", "male.map.csv")
liu_map <- lapply(files, function(f) read.csv(file.path(in_dir, f)))
liu_map <- lapply(liu_map, function(m) { cbind(m, marker=paste(m$Chr, m$Pos, sep="_")) })
names(liu_map) <- lapply(strsplit(files, "\\."), "[", 1)
liu_map <- lapply(liu_map, function(m) {
    colnames(m)[1:2] <- tolower(colnames(m)[1:2])
    m$chr[m$chr==20] <- "X"
    m$chr <- factor(m$chr, c(1:19,"X"))
    m })

Strangely, they have different numbers of positions: the sex-averaged, female, and male maps have 31,731 19,462, and 16,120 positions, respectively.

And it turns out that the basepair positions are from build 37, so we first need to convert them to build 38. (I will separately use liftOver to convert the positions to build 38.)

write.table(cbind(chr=paste0("chr", liu_map$avg$chr), pos=paste(liu_map$avg$pos,liu_map$avg$pos,sep="-")),
            file.path(in_dir, "liu_build37_bp.txt"), sep=":",
            row.names=FALSE, col.names=FALSE, quote=FALSE)

Now reading the liftOver results back in:

liu_build38 <- scan(file.path(out_dir, "liu_build38.txt"), what=character())
liu_build38 <- as.data.frame(matrix(unlist(strsplit(liu_build38, "[:\\-]")), ncol=3, byrow=TRUE),
                             stringsAsFactors=FALSE)[,1:2]
colnames(liu_build38) <- c("chr", "bp_build38")
liu_build38$chr <- factor(substr(liu_build38$chr, 4, nchar(liu_build38$chr)), levels=c(1:19,"X"))
liu_build38$bp_build38 <- as.numeric(liu_build38$bp_build38)
stopifnot(all(liu_build38$chr == liu_map$avg$chr))
liu_map$avg$build38 <- liu_build38$bp_build38

As it turns out, there’s one pair of positions that are inverted in build 38 vs build 37 (on chr 4), the middle two markers here:

liu_map$avg[liu_map$avg$chr==4,][447:450,]
##      chr      pos       cM     marker  build38
## 6830   4 41712580 18.21238 4_41712580 41765707
## 6831   4 42209315 18.49628 4_42209315 42795721
## 6832   4 42229812 18.50053 4_42229812 42775224
## 6833   4 42900046 18.56148 4_42900046 42887174

These markers are present in the male map but not the female map. We’ll omit the marker at build 37 position 42,229,812.

drop <- 42229812
for(m in c("avg", "female", "male")) {
    map <- liu_map[[m]]
    map <- map[!(map$chr==4 & map$pos==drop),]
    liu_map[[m]] <- map
}

I next add the build 38 positions to the female and male maps.

for(m in c("female", "male")) {
    liu_map[[m]]$build38 <- liu_map$avg$build38[match(liu_map[[m]]$marker, liu_map$avg$marker)]
}

Now to align the maps and make a single data frame. I’m going to split them into lists and interpolate positions in the female and male maps so that they are all the same length. Note that the male map doesn’t include the X chromosome

liu_pmap_list <- lapply(liu_map, map_df_to_list, chr_column="chr", pos_column="build38")
liu_gmap_list <- lapply(liu_map, map_df_to_list, chr_column="chr", pos_column="cM")
liu_gmap_list$female <- interp_map(liu_pmap_list$avg, liu_pmap_list$female, liu_gmap_list$female)
liu_gmap_list$male <- interp_map(liu_pmap_list$avg[1:19], liu_pmap_list$male, liu_gmap_list$male)

Now I’ll combine everything into one data frame.

liu_map <- cbind(liu_map$avg[,c("marker", "chr", "build38", "pos", "cM")],
                 cM_female = map_list_to_df(liu_gmap_list$female, pos_column="cM_female")[,"cM_female"],
                 cM_male = c(map_list_to_df(liu_gmap_list$male, pos_column="cM_male")[,"cM_male"],
                             rep(NA, sum(liu_map$avg$chr=="X"))))
colnames(liu_map)[3:4] <- c("bp_build38", "bp_build37")

Get the chromosome lengths

Now we want to anchor the maps at 0 cM. I also want to anchor them at the telomeres. So I first need to get the chromosome lengths. I’ll start by grabbing the build 38 from the web, at https://www.ncbi.nlm.nih.gov/grc/mouse/data".

I use rvest to scrape the chromosome lengths from the web page.

library(rvest)
url <- "https://www.ncbi.nlm.nih.gov/grc/mouse/data"
file <- file.path(in_dir, "grc_mouse_genome_assembly.html")
if(!file.exists(file)) download.file(url, file)
z <- read_html(file)
tab <- html_nodes(z, css="table.ui-ncbigrid")
tab <- html_table(tab)[[1]]
mm10_L <- as.numeric(sapply(strsplit(tab[,2], ","), paste, collapse=""))[1:20]
names(mm10_L) <- c(1:19,"X")

Let’s double-check these using AnnotationHub. I’m grabbing the cytobands which seem to be the smallest thing that will give me full lengths.

file <- file.path(in_dir, "ah51380.rds")
if(file.exists(file)) {
    zz <- readRDS(file)
} else {
    library(AnnotationHub)
    ah <- AnnotationHub()
    z <- query(ah, "mm10")
    zz <- as.data.frame(z[["AH53180"]]) # cytoband track
    saveRDS(zz, file)
}
mm10_Lb <- tapply(zz$end, zz$seqnames, max)
mm10_Lb <- setNames(mm10_Lb, substr(names(mm10_Lb), 4, nchar(mm10_Lb)))[1:20]

These are the same as those from the web.

And are all longer than Liu map? TRUE. Yay!

Shift the maps

Okay, so now we’ve got the Liu et al. maps into build 38 and with the sex-averaged, female, and male maps aligned. Next we want to add the 0 position plus the telomere and then determine the correponding cM positions.

I’m going to do the split, interpolate, recombine, again.

First split:

pmap <- map_df_to_list(liu_map, pos_column="bp_build38")
gmap_ave <- map_df_to_list(liu_map, pos_column="cM")
gmap_fem <- map_df_to_list(liu_map, pos_column="cM_female")
gmap_mal <- map_df_to_list(liu_map, pos_column="cM_male")

Now anchor the chromosomes at 0 and the telomere:

pmap_anchored <- pmap
for(chr in seq_along(pmap)) {
    pmap_anchored[[chr]] <- c(0, pmap[[chr]], mm10_L[chr])
    names(pmap_anchored[[chr]])[c(1, length(pmap_anchored[[chr]]))] <-
        paste0(c("zero", "telo"), names(pmap[chr]))
}

Now I do the interpolation to get the cM positions for the zero and telomere points.

gmap_ave <- interp_map(pmap_anchored, pmap, gmap_ave)
gmap_fem <- interp_map(pmap_anchored, pmap, gmap_fem)
gmap_mal <- interp_map(pmap_anchored, pmap, gmap_mal)

Now we shift the genetic maps so they start at 0.

gmap_ave <- lapply(gmap_ave, function(a) a-min(a))
gmap_fem <- lapply(gmap_fem, function(a) a-min(a))
gmap_mal <- lapply(gmap_mal, function(a) a-min(a))

Finally, we bring them back together into a data frame.

liu_map <- cbind(map_list_to_df(pmap_anchored, pos_column="bp_build38"),
                 cM=map_list_to_df(gmap_ave, pos_column="cM")[,"cM"],
                 cM_female=map_list_to_df(gmap_ave, pos_column="cM_female")[,"cM_female"],
                 cM_male=map_list_to_df(gmap_ave, pos_column="cM_male")[,"cM_male"])

We also need to change the position names to use the build38 positions.

wh <- grep("_", liu_map$marker)
rownames(liu_map)[wh] <- liu_map$marker[wh] <- paste(liu_map$chr[wh], liu_map$bp_build38[wh], sep="_")

Let’s grab the physical and genetic maps as lists, for later use.

liu_pmap <- map_df_to_list(liu_map, pos_column="bp_build38")
liu_gmap <- map_df_to_list(liu_map, pos_column="cM")

Interpolate GigaMUGA cM

Let’s now get the interpolated cM positions for the GigaMUGA markers. We first download the SNP map file and load it into R.

url <- "ftp://ftp.jax.org/MUGA/GM_snps.Rdata"
file <- file.path(in_dir, basename(url))
if(!file.exists(file)) {
    download.file(url, file)
}
load(file)

Now we split the physical positions into a list, first subsetting to the autosomes and the X chromosome. Also, we convert the Mbp positions to basepairs.

GM_pmap <- map_df_to_list(GM_snps[GM_snps$chr %in% c(1:19,"X"),], pos_column="pos")
GM_pmap <- lapply(GM_pmap, function(a) round(a*1e6))

And now let’s get interplated cM positions using the Liu et al. map.

GM_gmap <- interp_map(GM_pmap, liu_pmap, liu_gmap)

Finally, we paste the cM positions back into the GM_snps object.

GM_snps$cM[GM_snps$chr %in% c(1:19,"X")] <- map_list_to_df(GM_gmap, pos_column="cM")[,"cM"]

I’ll also make the chr column a factor, with levels 1, 2, …, 19, X, Y, and M.

GM_snps$chr <- factor(GM_snps$chr, levels=c(1:19,"X", "Y", "M"))

Interpolate MegaMUGA cM

Let’s do the same thing with the MegaMUGA markers. We first download the SNP map file and load it into R.

url <- "ftp://ftp.jax.org/MUGA/MM_snps.Rdata"
file <- file.path(in_dir, basename(url))
if(!file.exists(file)) {
    download.file(url, file)
}
load(file)

Note that the MM_snps object has markers on chr 1-19, X, Y, P, and 22. No missing values in the chr column, and no notations for the mitochondria. But there are missing values in the pos column.

We split the physical positions into a list, first subsetting to the autosomes and the X chromosome (and omitting snps missing physical positions). Also, we convert the Mbp positions to basepairs.

MM_keep <- !is.na(MM_snps$pos) & MM_snps$chr %in% c(1:19,"X")
MM_pmap <- map_df_to_list(MM_snps[MM_keep,], pos_column="pos")
MM_pmap <- lapply(MM_pmap, function(a) round(a*1e6))

And now let’s get interplated cM positions using the Liu et al. map.

MM_gmap <- interp_map(MM_pmap, liu_pmap, liu_gmap)

Finally, we paste the cM positions back into the MM_snps object.

MM_snps$cM[MM_keep] <- map_list_to_df(MM_gmap, pos_column="cM")[,"cM"]

Genome grid in cM

Let’s now calculate a grid along the genetic map, and find the corresponding bp positions. On each chromosome, I’ll have the grid start at 3 Mbp and go to the telomere. (Traditionally, the mouse chromosomes sequence all start at 3 Mbp, leaving space for the centromere.)

So first we need to add a position at 3 Mb for each chromosome and interpolate to get the corresponding cM location.

liu_pmap_3Mbp <- lapply(liu_pmap, function(a) sort(c(a, start=3e6)))
liu_gmap_3Mbp <- interp_map(liu_pmap_3Mbp, liu_pmap, liu_gmap)

Now we want to form a grid starting at the second position on each chromosome and ending at the telomere. KB Choi’s 64k grid is a nice round number, but the inter-marker distances are slightly different between chromosomes. I’d be inclined to go with a 0.02 cM grid, which would give 68,433 positions, or a 0.01 cM grid, which would give 136,854 positions.

Let’s go with a 0.02 cM grid for now.

grid_gmap <- lapply(liu_gmap_3Mbp, function(a) seq(a[2], a[length(a)], by=0.02))

And next get interpolated physical positions, rounding to basepairs.

grid_pmap <- interp_map(grid_gmap, liu_gmap_3Mbp, liu_pmap_3Mbp)
grid_pmap <- lapply(grid_pmap, round)

Now, we add names.

for(i in seq_along(grid_pmap)) {
    names(grid_pmap[[i]]) <- names(grid_gmap[[i]]) <-
        paste(names(grid_pmap)[i], grid_pmap[[i]], sep="_")
}

And we put them back together as a data frame.

grid <- cbind(map_list_to_df(grid_pmap, pos_column="bp"),
              cM=map_list_to_df(grid_gmap, pos_column="cM")[,"cM"])
rownames(grid) <- grid$marker
grid$pos <- grid$bp/1e6
grid <- grid[,c("marker", "chr", "pos", "cM", "bp")]

The resulting grid object has 68,433 rows. The columns are slightly different than the snps object in the DO378_islet_v2.RData file, in that the snps object has a column called bp that is actually Mbp and a column pos that is basepairs. The new grid object has the same content, but the third column with Mbp positions is called pos and the final column with basepairs is called bp.

RNAseq gene positions on grid

We won’t download the islet RNA-seq data, because it’s not yet officially public, but we grabbed the file DO378_islet_v2.RData from the Dan Gatti’s directory on the JAX ftp site.

file <- file.path(in_dir, "DO378_islet_v2.RData")
load(file)

The object snps is the previous 64k grid with positions evenly spaced in cM. (Again note that the spacing is slightly different between chromosomes.)

The object we want is annot.mrna. The chr column has 1-19, X, Y, and MT. There are columns start, end, and middle_point, all in basepairs, with middle_point being round((start+end)/2). The column nearest_snp has integers in the range 1-64k, with missing values for the genes on the Y and MT; this is the index of the nearest point on the 64k SNP grid. We want to replace those values with the indices corresponding to our new grid.

Here’s a function to do the job for a single position:

find_nearest <-
    function(chr, pos, map, index)
{
    if(!(chr %in% names(map))) return(NA)
    index[[chr]][ which.min( abs(pos - map[[chr]]) ) ]
}

Now we pull out the grid positions as a list and calculate a corresponding list of grid indices.

grid_pmap <- map_df_to_list(grid, pos_column="bp")
grid_index <- map_df_to_list(cbind(grid, index=1:nrow(grid)), pos_column="index")

Finally, we calculate the new grid indices and plug them into the object.

index <- rep(NA, nrow(annot.mrna))
for(i in 1:nrow(annot.mrna)) {
    index[i] <- find_nearest(annot.mrna$chr[i], annot.mrna$middle_point[i],
                             grid_pmap, grid_index)
}
annot.mrna_newgrid <- annot.mrna
annot.mrna_newgrid$nearest_snp <- index

One concern about the new grid

One potential issue to keep in mind: the Liu et al. map has some long regions with very low recombination, and so this new 0.02 cM grid has some pretty big physical gaps, particularly on chromosome 14. Across the whole genome, there are 82 gaps > 1 Mbp, and 11 gaps > 2 Mbp, and then chromosome 14 has two gaps > 5 Mbp.

In contrast, KB Choi’s original grid had 0 gaps > 2 Mbp and just 8 gaps > 1 Mbp.

One result of these physical gaps in the grid is that the many of the genes in annot.mrna are rather far away from the closest grid position.

With the original grid, of the 21,771 genes on the autosomes or X chromosome, 0 are more than 1 Mbp away from a grid position, and just 8 are more than 500 kbp away from a grid position.

With the new grid, 36 are more than 1 Mbp away from a grid position, and 129 are more than 500 kbp away from a grid position. There are 7 genes more than 2 Mbp from a grid position, all on chr 14. The maximum distance is 2.8 Mbp.

Add more pseudomarkers

To deal with the physical gaps between grid points, I’m inclined to add additional pseudomarkers so that no two points are more than 0.5 Mbp apart.

grid_pmap <- lapply(grid_pmap, function(a) a/1e6) # pmap in Mbp rather than bp

grid_pmap_plus <- insert_pseudomarkers(grid_pmap, step=0.5, stepwidth="max")
for(i in seq_along(grid_pmap_plus)) {
    names(grid_pmap_plus[[i]]) <- paste(names(grid_pmap)[i], round(grid_pmap_plus[[i]]*1e6), sep="_")
}
grid_gmap_plus <- interp_map(grid_pmap_plus, grid_pmap, grid_gmap)

grid_plus <- cbind(map_list_to_df(grid_pmap_plus, pos_column="pos"),
              cM=map_list_to_df(grid_gmap_plus, pos_column="cM")[,"cM"])
rownames(grid_plus) <- grid_plus$marker
grid_plus$bp <- round(grid_plus$pos*1e6)
grid_plus <- grid_plus[,c("marker", "chr", "pos", "cM", "bp")]

This increases the number of grid points from 68433 to 69005 (that is, an additional 572 points).

Save stuff to files

Now let’s save everything to files. I’m going to use .rds files rather than .RData files, as they each will contain one object.

saveRDS(liu_map, file.path(out_dir, "liu_map.rds"))
saveRDS(GM_snps, file.path(out_dir, "GM_snps_v2.rds"))
saveRDS(MM_snps, file.path(out_dir, "MM_snps_v2.rds"))
saveRDS(grid, file.path(out_dir, "grid_0.02cM.rds"))
saveRDS(grid_plus, file.path(out_dir, "grid_0.02cM_plus.rds"))
saveRDS(annot.mrna_newgrid, file.path(out_dir, "annot_mrna_0.02cM_grid.rds"))

Session Info

devtools::session_info()
##  setting  value                       
##  version  R version 3.4.2 (2017-09-28)
##  system   x86_64, darwin15.6.0        
##  ui       X11                         
##  language en_US.UTF-8                 
##  collate  en_US.UTF-8                 
##  tz       America/Chicago             
##  date     2017-11-18                  
## 
##  package     * version  date       source        
##  assertthat    0.2.0    2017-04-11 CRAN (R 3.4.0)
##  backports     1.1.1    2017-09-25 CRAN (R 3.4.2)
##  base        * 3.4.2    2017-10-04 local         
##  broman      * 0.66-2   2017-10-24 local         
##  compiler      3.4.2    2017-10-04 local         
##  data.table    1.10.4-3 2017-10-27 CRAN (R 3.4.2)
##  datasets    * 3.4.2    2017-10-04 local         
##  devtools      1.13.3   2017-08-02 CRAN (R 3.4.1)
##  digest        0.6.12   2017-01-27 CRAN (R 3.4.0)
##  evaluate      0.10.1   2017-06-24 CRAN (R 3.4.0)
##  graphics    * 3.4.2    2017-10-04 local         
##  grDevices   * 3.4.2    2017-10-04 local         
##  htmltools     0.3.6    2017-04-28 CRAN (R 3.4.0)
##  httr          1.3.1    2017-08-20 CRAN (R 3.4.1)
##  knitr         1.17     2017-08-10 CRAN (R 3.4.1)
##  magrittr      1.5      2014-11-22 CRAN (R 3.4.0)
##  memoise       1.1.0    2017-04-21 CRAN (R 3.4.0)
##  methods     * 3.4.2    2017-10-04 local         
##  parallel      3.4.2    2017-10-04 local         
##  qtl           1.42-2   2017-09-13 local         
##  qtl2convert * 0.5-10   2017-11-18 local         
##  qtl2geno    * 0.5-37   2017-11-18 local         
##  qtl2scan    * 0.5-25   2017-11-18 local         
##  R6            2.2.2    2017-06-17 CRAN (R 3.4.0)
##  Rcpp          0.12.13  2017-09-28 CRAN (R 3.4.2)
##  rmarkdown     1.6      2017-06-15 CRAN (R 3.4.0)
##  rprojroot     1.2      2017-01-16 CRAN (R 3.4.0)
##  rvest       * 0.3.2    2016-06-17 CRAN (R 3.4.0)
##  selectr       0.3-1    2016-12-19 CRAN (R 3.4.0)
##  stats       * 3.4.2    2017-10-04 local         
##  stringi       1.1.5    2017-04-07 CRAN (R 3.4.0)
##  stringr       1.2.0    2017-02-18 CRAN (R 3.4.0)
##  tools         3.4.2    2017-10-04 local         
##  utils       * 3.4.2    2017-10-04 local         
##  withr         2.0.0    2017-07-28 CRAN (R 3.4.1)
##  XML           3.98-1.9 2017-06-19 CRAN (R 3.4.0)
##  xml2        * 1.1.1    2017-01-24 CRAN (R 3.4.0)
##  yaml          2.1.14   2016-11-12 CRAN (R 3.4.0)