This analysis serves to do three things:
Shift the Liu et al (2014) mouse genetic map so that 0 cM corresponds to 0 Mbp.
Calculate a grid of markers across the genome.
Get interpolated cM positions for the markers on the GigaMUGA array.
I’ll use the R/qtl2scan package to do interpolations, R/qtl2convert to do some data manipulation, and R/broman for a few minor things. I’ll also use a few other packages below (rvest, AnnotationHub, and devtools).
library(qtl2geno)
library(qtl2scan)
library(qtl2convert)
library(broman)
I first need to download the Liu et al (2014) maps and do a bit of preparations.
I’ll first ensure that I’ve got directories to contain the files I download plus those I create.
in_dir <- "files"
out_dir <- "results"
for(dir in c(in_dir, out_dir)) {
if(!dir.exists(dir)) dir.create(dir)
}
Now I’ll download and unzip the Liu et al (2014) map.
url <- "http://cgd.jax.org/mousemapconverter/G2F1.anchor.maps.zip"
file <- file.path(in_dir, basename(url))
if(!file.exists(file)) {
download.file(url, file)
}
unzip(file, exdir=in_dir)
I’ll read in the sex-averaged, female, and male genetic maps. I’m also going to give each row a name that is like 1_3036178
for chromosome 1 at 3,036,178 bp.
files <- c("avg.map.csv", "female.map.csv", "male.map.csv")
liu_map <- lapply(files, function(f) read.csv(file.path(in_dir, f)))
liu_map <- lapply(liu_map, function(m) { cbind(m, marker=paste(m$Chr, m$Pos, sep="_")) })
names(liu_map) <- lapply(strsplit(files, "\\."), "[", 1)
liu_map <- lapply(liu_map, function(m) {
colnames(m)[1:2] <- tolower(colnames(m)[1:2])
m$chr[m$chr==20] <- "X"
m$chr <- factor(m$chr, c(1:19,"X"))
m })
Strangely, they have different numbers of positions: the sex-averaged, female, and male maps have 31,731 19,462, and 16,120 positions, respectively.
And it turns out that the basepair positions are from build 37, so we first need to convert them to build 38. (I will separately use liftOver to convert the positions to build 38.)
write.table(cbind(chr=paste0("chr", liu_map$avg$chr), pos=paste(liu_map$avg$pos,liu_map$avg$pos,sep="-")),
file.path(in_dir, "liu_build37_bp.txt"), sep=":",
row.names=FALSE, col.names=FALSE, quote=FALSE)
Now reading the liftOver results back in:
liu_build38 <- scan(file.path(out_dir, "liu_build38.txt"), what=character())
liu_build38 <- as.data.frame(matrix(unlist(strsplit(liu_build38, "[:\\-]")), ncol=3, byrow=TRUE),
stringsAsFactors=FALSE)[,1:2]
colnames(liu_build38) <- c("chr", "bp_build38")
liu_build38$chr <- factor(substr(liu_build38$chr, 4, nchar(liu_build38$chr)), levels=c(1:19,"X"))
liu_build38$bp_build38 <- as.numeric(liu_build38$bp_build38)
stopifnot(all(liu_build38$chr == liu_map$avg$chr))
liu_map$avg$build38 <- liu_build38$bp_build38
As it turns out, there’s one pair of positions that are inverted in build 38 vs build 37 (on chr 4), the middle two markers here:
liu_map$avg[liu_map$avg$chr==4,][447:450,]
## chr pos cM marker build38
## 6830 4 41712580 18.21238 4_41712580 41765707
## 6831 4 42209315 18.49628 4_42209315 42795721
## 6832 4 42229812 18.50053 4_42229812 42775224
## 6833 4 42900046 18.56148 4_42900046 42887174
These markers are present in the male map but not the female map. We’ll omit the marker at build 37 position 42,229,812.
drop <- 42229812
for(m in c("avg", "female", "male")) {
map <- liu_map[[m]]
map <- map[!(map$chr==4 & map$pos==drop),]
liu_map[[m]] <- map
}
I next add the build 38 positions to the female and male maps.
for(m in c("female", "male")) {
liu_map[[m]]$build38 <- liu_map$avg$build38[match(liu_map[[m]]$marker, liu_map$avg$marker)]
}
Now to align the maps and make a single data frame. I’m going to split them into lists and interpolate positions in the female and male maps so that they are all the same length. Note that the male map doesn’t include the X chromosome
liu_pmap_list <- lapply(liu_map, map_df_to_list, chr_column="chr", pos_column="build38")
liu_gmap_list <- lapply(liu_map, map_df_to_list, chr_column="chr", pos_column="cM")
liu_gmap_list$female <- interp_map(liu_pmap_list$avg, liu_pmap_list$female, liu_gmap_list$female)
liu_gmap_list$male <- interp_map(liu_pmap_list$avg[1:19], liu_pmap_list$male, liu_gmap_list$male)
Now I’ll combine everything into one data frame.
liu_map <- cbind(liu_map$avg[,c("marker", "chr", "build38", "pos", "cM")],
cM_female = map_list_to_df(liu_gmap_list$female, pos_column="cM_female")[,"cM_female"],
cM_male = c(map_list_to_df(liu_gmap_list$male, pos_column="cM_male")[,"cM_male"],
rep(NA, sum(liu_map$avg$chr=="X"))))
colnames(liu_map)[3:4] <- c("bp_build38", "bp_build37")
Now we want to anchor the maps at 0 cM. I also want to anchor them at the telomeres. So I first need to get the chromosome lengths. I’ll start by grabbing the build 38 from the web, at https://www.ncbi.nlm.nih.gov/grc/mouse/data".
I use rvest to scrape the chromosome lengths from the web page.
library(rvest)
url <- "https://www.ncbi.nlm.nih.gov/grc/mouse/data"
file <- file.path(in_dir, "grc_mouse_genome_assembly.html")
if(!file.exists(file)) download.file(url, file)
z <- read_html(file)
tab <- html_nodes(z, css="table.ui-ncbigrid")
tab <- html_table(tab)[[1]]
mm10_L <- as.numeric(sapply(strsplit(tab[,2], ","), paste, collapse=""))[1:20]
names(mm10_L) <- c(1:19,"X")
Let’s double-check these using AnnotationHub. I’m grabbing the cytobands which seem to be the smallest thing that will give me full lengths.
file <- file.path(in_dir, "ah51380.rds")
if(file.exists(file)) {
zz <- readRDS(file)
} else {
library(AnnotationHub)
ah <- AnnotationHub()
z <- query(ah, "mm10")
zz <- as.data.frame(z[["AH53180"]]) # cytoband track
saveRDS(zz, file)
}
mm10_Lb <- tapply(zz$end, zz$seqnames, max)
mm10_Lb <- setNames(mm10_Lb, substr(names(mm10_Lb), 4, nchar(mm10_Lb)))[1:20]
These are the same as those from the web.
And are all longer than Liu map? TRUE. Yay!
Okay, so now we’ve got the Liu et al. maps into build 38 and with the sex-averaged, female, and male maps aligned. Next we want to add the 0 position plus the telomere and then determine the correponding cM positions.
I’m going to do the split, interpolate, recombine, again.
First split:
pmap <- map_df_to_list(liu_map, pos_column="bp_build38")
gmap_ave <- map_df_to_list(liu_map, pos_column="cM")
gmap_fem <- map_df_to_list(liu_map, pos_column="cM_female")
gmap_mal <- map_df_to_list(liu_map, pos_column="cM_male")
Now anchor the chromosomes at 0 and the telomere:
pmap_anchored <- pmap
for(chr in seq_along(pmap)) {
pmap_anchored[[chr]] <- c(0, pmap[[chr]], mm10_L[chr])
names(pmap_anchored[[chr]])[c(1, length(pmap_anchored[[chr]]))] <-
paste0(c("zero", "telo"), names(pmap[chr]))
}
Now I do the interpolation to get the cM positions for the zero and telomere points.
gmap_ave <- interp_map(pmap_anchored, pmap, gmap_ave)
gmap_fem <- interp_map(pmap_anchored, pmap, gmap_fem)
gmap_mal <- interp_map(pmap_anchored, pmap, gmap_mal)
Now we shift the genetic maps so they start at 0.
gmap_ave <- lapply(gmap_ave, function(a) a-min(a))
gmap_fem <- lapply(gmap_fem, function(a) a-min(a))
gmap_mal <- lapply(gmap_mal, function(a) a-min(a))
Finally, we bring them back together into a data frame.
liu_map <- cbind(map_list_to_df(pmap_anchored, pos_column="bp_build38"),
cM=map_list_to_df(gmap_ave, pos_column="cM")[,"cM"],
cM_female=map_list_to_df(gmap_ave, pos_column="cM_female")[,"cM_female"],
cM_male=map_list_to_df(gmap_ave, pos_column="cM_male")[,"cM_male"])
We also need to change the position names to use the build38 positions.
wh <- grep("_", liu_map$marker)
rownames(liu_map)[wh] <- liu_map$marker[wh] <- paste(liu_map$chr[wh], liu_map$bp_build38[wh], sep="_")
Let’s grab the physical and genetic maps as lists, for later use.
liu_pmap <- map_df_to_list(liu_map, pos_column="bp_build38")
liu_gmap <- map_df_to_list(liu_map, pos_column="cM")
Let’s now get the interpolated cM positions for the GigaMUGA markers. We first download the SNP map file and load it into R.
url <- "ftp://ftp.jax.org/MUGA/GM_snps.Rdata"
file <- file.path(in_dir, basename(url))
if(!file.exists(file)) {
download.file(url, file)
}
load(file)
Now we split the physical positions into a list, first subsetting to the autosomes and the X chromosome. Also, we convert the Mbp positions to basepairs.
GM_pmap <- map_df_to_list(GM_snps[GM_snps$chr %in% c(1:19,"X"),], pos_column="pos")
GM_pmap <- lapply(GM_pmap, function(a) round(a*1e6))
And now let’s get interplated cM positions using the Liu et al. map.
GM_gmap <- interp_map(GM_pmap, liu_pmap, liu_gmap)
Finally, we paste the cM positions back into the GM_snps
object.
GM_snps$cM[GM_snps$chr %in% c(1:19,"X")] <- map_list_to_df(GM_gmap, pos_column="cM")[,"cM"]
I’ll also make the chr
column a factor, with levels 1
, 2
, …, 19
, X
, Y
, and M
.
GM_snps$chr <- factor(GM_snps$chr, levels=c(1:19,"X", "Y", "M"))
Let’s do the same thing with the MegaMUGA markers. We first download the SNP map file and load it into R.
url <- "ftp://ftp.jax.org/MUGA/MM_snps.Rdata"
file <- file.path(in_dir, basename(url))
if(!file.exists(file)) {
download.file(url, file)
}
load(file)
Note that the MM_snps
object has markers on chr 1-19, X, Y, P, and 22. No missing values in the chr
column, and no notations for the mitochondria. But there are missing values in the pos
column.
We split the physical positions into a list, first subsetting to the autosomes and the X chromosome (and omitting snps missing physical positions). Also, we convert the Mbp positions to basepairs.
MM_keep <- !is.na(MM_snps$pos) & MM_snps$chr %in% c(1:19,"X")
MM_pmap <- map_df_to_list(MM_snps[MM_keep,], pos_column="pos")
MM_pmap <- lapply(MM_pmap, function(a) round(a*1e6))
And now let’s get interplated cM positions using the Liu et al. map.
MM_gmap <- interp_map(MM_pmap, liu_pmap, liu_gmap)
Finally, we paste the cM positions back into the MM_snps
object.
MM_snps$cM[MM_keep] <- map_list_to_df(MM_gmap, pos_column="cM")[,"cM"]
Let’s now calculate a grid along the genetic map, and find the corresponding bp positions. On each chromosome, I’ll have the grid start at 3 Mbp and go to the telomere. (Traditionally, the mouse chromosomes sequence all start at 3 Mbp, leaving space for the centromere.)
So first we need to add a position at 3 Mb for each chromosome and interpolate to get the corresponding cM location.
liu_pmap_3Mbp <- lapply(liu_pmap, function(a) sort(c(a, start=3e6)))
liu_gmap_3Mbp <- interp_map(liu_pmap_3Mbp, liu_pmap, liu_gmap)
Now we want to form a grid starting at the second position on each chromosome and ending at the telomere. KB Choi’s 64k grid is a nice round number, but the inter-marker distances are slightly different between chromosomes. I’d be inclined to go with a 0.02 cM grid, which would give 68,433 positions, or a 0.01 cM grid, which would give 136,854 positions.
Let’s go with a 0.02 cM grid for now.
grid_gmap <- lapply(liu_gmap_3Mbp, function(a) seq(a[2], a[length(a)], by=0.02))
And next get interpolated physical positions, rounding to basepairs.
grid_pmap <- interp_map(grid_gmap, liu_gmap_3Mbp, liu_pmap_3Mbp)
grid_pmap <- lapply(grid_pmap, round)
Now, we add names.
for(i in seq_along(grid_pmap)) {
names(grid_pmap[[i]]) <- names(grid_gmap[[i]]) <-
paste(names(grid_pmap)[i], grid_pmap[[i]], sep="_")
}
And we put them back together as a data frame.
grid <- cbind(map_list_to_df(grid_pmap, pos_column="bp"),
cM=map_list_to_df(grid_gmap, pos_column="cM")[,"cM"])
rownames(grid) <- grid$marker
grid$pos <- grid$bp/1e6
grid <- grid[,c("marker", "chr", "pos", "cM", "bp")]
The resulting grid
object has 68,433 rows. The columns are slightly different than the snps
object in the DO378_islet_v2.RData
file, in that the snps
object has a column called bp
that is actually Mbp and a column pos
that is basepairs. The new grid
object has the same content, but the third column with Mbp positions is called pos
and the final column with basepairs is called bp
.
We won’t download the islet RNA-seq data, because it’s not yet officially public, but we grabbed the file DO378_islet_v2.RData
from the Dan Gatti’s directory on the JAX ftp site.
file <- file.path(in_dir, "DO378_islet_v2.RData")
load(file)
The object snps
is the previous 64k grid with positions evenly spaced in cM. (Again note that the spacing is slightly different between chromosomes.)
The object we want is annot.mrna
. The chr
column has 1-19, X
, Y
, and MT
. There are columns start
, end
, and middle_point
, all in basepairs, with middle_point
being round((start+end)/2)
. The column nearest_snp
has integers in the range 1-64k, with missing values for the genes on the Y and MT; this is the index of the nearest point on the 64k SNP grid. We want to replace those values with the indices corresponding to our new grid.
Here’s a function to do the job for a single position:
find_nearest <-
function(chr, pos, map, index)
{
if(!(chr %in% names(map))) return(NA)
index[[chr]][ which.min( abs(pos - map[[chr]]) ) ]
}
Now we pull out the grid positions as a list and calculate a corresponding list of grid indices.
grid_pmap <- map_df_to_list(grid, pos_column="bp")
grid_index <- map_df_to_list(cbind(grid, index=1:nrow(grid)), pos_column="index")
Finally, we calculate the new grid indices and plug them into the object.
index <- rep(NA, nrow(annot.mrna))
for(i in 1:nrow(annot.mrna)) {
index[i] <- find_nearest(annot.mrna$chr[i], annot.mrna$middle_point[i],
grid_pmap, grid_index)
}
annot.mrna_newgrid <- annot.mrna
annot.mrna_newgrid$nearest_snp <- index
One potential issue to keep in mind: the Liu et al. map has some long regions with very low recombination, and so this new 0.02 cM grid has some pretty big physical gaps, particularly on chromosome 14. Across the whole genome, there are 82 gaps > 1 Mbp, and 11 gaps > 2 Mbp, and then chromosome 14 has two gaps > 5 Mbp.
In contrast, KB Choi’s original grid had 0 gaps > 2 Mbp and just 8 gaps > 1 Mbp.
One result of these physical gaps in the grid is that the many of the genes in annot.mrna
are rather far away from the closest grid position.
With the original grid, of the 21,771 genes on the autosomes or X chromosome, 0 are more than 1 Mbp away from a grid position, and just 8 are more than 500 kbp away from a grid position.
With the new grid, 36 are more than 1 Mbp away from a grid position, and 129 are more than 500 kbp away from a grid position. There are 7 genes more than 2 Mbp from a grid position, all on chr 14. The maximum distance is 2.8 Mbp.
To deal with the physical gaps between grid points, I’m inclined to add additional pseudomarkers so that no two points are more than 0.5 Mbp apart.
grid_pmap <- lapply(grid_pmap, function(a) a/1e6) # pmap in Mbp rather than bp
grid_pmap_plus <- insert_pseudomarkers(grid_pmap, step=0.5, stepwidth="max")
for(i in seq_along(grid_pmap_plus)) {
names(grid_pmap_plus[[i]]) <- paste(names(grid_pmap)[i], round(grid_pmap_plus[[i]]*1e6), sep="_")
}
grid_gmap_plus <- interp_map(grid_pmap_plus, grid_pmap, grid_gmap)
grid_plus <- cbind(map_list_to_df(grid_pmap_plus, pos_column="pos"),
cM=map_list_to_df(grid_gmap_plus, pos_column="cM")[,"cM"])
rownames(grid_plus) <- grid_plus$marker
grid_plus$bp <- round(grid_plus$pos*1e6)
grid_plus <- grid_plus[,c("marker", "chr", "pos", "cM", "bp")]
This increases the number of grid points from 68433 to 69005 (that is, an additional 572 points).
Now let’s save everything to files. I’m going to use .rds
files rather than .RData
files, as they each will contain one object.
saveRDS(liu_map, file.path(out_dir, "liu_map.rds"))
saveRDS(GM_snps, file.path(out_dir, "GM_snps_v2.rds"))
saveRDS(MM_snps, file.path(out_dir, "MM_snps_v2.rds"))
saveRDS(grid, file.path(out_dir, "grid_0.02cM.rds"))
saveRDS(grid_plus, file.path(out_dir, "grid_0.02cM_plus.rds"))
saveRDS(annot.mrna_newgrid, file.path(out_dir, "annot_mrna_0.02cM_grid.rds"))
devtools::session_info()
## setting value
## version R version 3.4.2 (2017-09-28)
## system x86_64, darwin15.6.0
## ui X11
## language en_US.UTF-8
## collate en_US.UTF-8
## tz America/Chicago
## date 2017-11-18
##
## package * version date source
## assertthat 0.2.0 2017-04-11 CRAN (R 3.4.0)
## backports 1.1.1 2017-09-25 CRAN (R 3.4.2)
## base * 3.4.2 2017-10-04 local
## broman * 0.66-2 2017-10-24 local
## compiler 3.4.2 2017-10-04 local
## data.table 1.10.4-3 2017-10-27 CRAN (R 3.4.2)
## datasets * 3.4.2 2017-10-04 local
## devtools 1.13.3 2017-08-02 CRAN (R 3.4.1)
## digest 0.6.12 2017-01-27 CRAN (R 3.4.0)
## evaluate 0.10.1 2017-06-24 CRAN (R 3.4.0)
## graphics * 3.4.2 2017-10-04 local
## grDevices * 3.4.2 2017-10-04 local
## htmltools 0.3.6 2017-04-28 CRAN (R 3.4.0)
## httr 1.3.1 2017-08-20 CRAN (R 3.4.1)
## knitr 1.17 2017-08-10 CRAN (R 3.4.1)
## magrittr 1.5 2014-11-22 CRAN (R 3.4.0)
## memoise 1.1.0 2017-04-21 CRAN (R 3.4.0)
## methods * 3.4.2 2017-10-04 local
## parallel 3.4.2 2017-10-04 local
## qtl 1.42-2 2017-09-13 local
## qtl2convert * 0.5-10 2017-11-18 local
## qtl2geno * 0.5-37 2017-11-18 local
## qtl2scan * 0.5-25 2017-11-18 local
## R6 2.2.2 2017-06-17 CRAN (R 3.4.0)
## Rcpp 0.12.13 2017-09-28 CRAN (R 3.4.2)
## rmarkdown 1.6 2017-06-15 CRAN (R 3.4.0)
## rprojroot 1.2 2017-01-16 CRAN (R 3.4.0)
## rvest * 0.3.2 2016-06-17 CRAN (R 3.4.0)
## selectr 0.3-1 2016-12-19 CRAN (R 3.4.0)
## stats * 3.4.2 2017-10-04 local
## stringi 1.1.5 2017-04-07 CRAN (R 3.4.0)
## stringr 1.2.0 2017-02-18 CRAN (R 3.4.0)
## tools 3.4.2 2017-10-04 local
## utils * 3.4.2 2017-10-04 local
## withr 2.0.0 2017-07-28 CRAN (R 3.4.1)
## XML 3.98-1.9 2017-06-19 CRAN (R 3.4.0)
## xml2 * 1.1.1 2017-01-24 CRAN (R 3.4.0)
## yaml 2.1.14 2016-11-12 CRAN (R 3.4.0)