# solutions: R problem set for 140.778
#
# Part A

# read the two comma-separated input files (first line = column names);
# the strings "." and "-99" are treated as missing values
expr <- read.table("expr.csv", header = TRUE, sep = ",",
                   na.strings = c(".", "-99"))
biol <- read.table("biol.csv", header = TRUE, sep = ",",
                   na.strings = c(".", "-99"))

# what kind of object is each data set: matrix, list or data.frame?
is.matrix(expr)
is.list(expr)
is.data.frame(expr)

is.matrix(biol)
is.list(biol)
is.data.frame(biol)

# for every column of biol: its mode, and whether it is a factor and/or numeric
sapply(biol, mode)
sapply(biol, is.factor)
sapply(biol, is.numeric)

# the same three questions for every column of expr
sapply(expr, mode)
sapply(expr, is.factor)
sapply(expr, is.numeric)

# show the rows of biol that contain at least one missing value:
# is.na() gives a logical matrix, which is reduced row by row with any()
biol[apply(is.na(biol), 1, any), ]

# mean of each column in expr, ignoring missing values
# (the sample column is still part of expr at this point)
sapply(expr, mean, na.rm = TRUE)

# use the sample number as the row names, then drop the first column
# (presumably the sample column itself -- the code removes column 1 by position)
rownames(expr) <- as.character(expr$sample)
expr <- expr[, -1]

# mean, SD and range of each remaining column, ignoring missing values
sapply(expr, mean, na.rm = TRUE)
sapply(expr, sd, na.rm = TRUE)
sapply(expr, range, na.rm = TRUE)

# correlation matrices rounded to 2 decimals: first with use = "complete.obs",
# then with use = "pairwise.complete.obs"
round(cor(expr, use = "complete.obs"), 2)
round(cor(expr, use = "pairwise.complete.obs"), 2)

# for every row, take the mean of the first two columns (ignoring NAs) and
# subtract it from each of the other columns; only those other columns are kept
expr <- expr[, -c(1, 2)] - apply(expr[, c(1, 2)], 1, mean, na.rm = TRUE)

# scatterplot matrix of the adjusted columns
pairs(expr)

# fix up the biol data.frame:
#   - the first character of the sample label becomes the new factor f3
#   - the rest of the label (from the 2nd character on) becomes the row name
#   - the first column is then dropped (presumably the sample column itself;
#     the code removes column 1 by position)
biol$f3 <- factor(substring(biol$sample, 1, 1))
rownames(biol) <- substring(biol$sample, 2)
biol <- biol[, -1]

# check that biol and expr have exactly the same row names, just possibly in
# different orders; identical() compares the two sorted vectors as a whole, so
# it also gives a clean FALSE when the numbers of rows differ (an element-wise
# == would recycle the shorter vector there)
identical(sort(rownames(biol)), sort(rownames(expr)))

# sort the rows in biol and expr by their row names
# (drop = FALSE keeps a data.frame even if only a single column is present)
biol <- biol[sort(rownames(biol)), , drop = FALSE]
expr <- expr[sort(rownames(expr)), , drop = FALSE]

# mean & SD of each column in expr, computed separately within each group
# defined by biol$f1 (missing values are ignored)
apply(expr, 2, function(column, groups) {
  tapply(column, groups, function(v) {
    c(mean(v, na.rm = TRUE), sd(v, na.rm = TRUE))
  })
}, biol$f1)

# t.test comparing the groups defined by biol$f2, run separately for each
# column of expr; the result is one p-value per column.
# t.test() is provided by the 'stats' package, which R attaches by default.
# The former 'ctest' package was merged into 'stats' (R 1.9.0) and no longer
# exists, so it must not be loaded with library().
apply(expr, 2, function(values, groups) {
  by_group <- split(values, groups)
  # only the first two groups produced by split() are compared
  t.test(by_group[[1]], by_group[[2]])$p.value
}, biol$f2)