Stromal prognosis analysis for Aedin

Load data:

library(curatedOvarianData)
library(gdata)
library(affy)
library(metafor)
library(xtable)
library(survival)
x <- read.xls("../input/Supplementary Table 1 - Tumor and stromal genes.xlsx", 
    as.is = TRUE)
head(x)

Create a list of ExpressionSets from curatedOvarianData:

dir.create("../output/data", recursive = TRUE)
source("../input/patientselection.config")
source("createEsetList.R")
## 2013-11-27 12:06:45 INFO::Inside script createEsetList.R - inputArgs =
## 2013-11-27 12:06:45 INFO::
## 2013-11-27 12:06:45 INFO::Loading curatedOvarianData 1.0.5
## 2013-11-27 12:07:16 INFO::Removing  PMID17290060_eset  (remove.datasets)
## 2013-11-27 12:07:16 INFO::Clean up the esets.
## 2013-11-27 12:07:16 INFO::including E.MTAB.386_eset
## 2013-11-27 12:07:16 INFO::excluding GSE12418_eset (min.number.of.events or min.sample.size)
## 2013-11-27 12:07:16 INFO::excluding GSE12470_eset (min.number.of.events or min.sample.size)
## 2013-11-27 12:07:17 INFO::including GSE13876_eset
## 2013-11-27 12:07:17 INFO::excluding GSE14764_eset (min.number.of.events or min.sample.size)
## 2013-11-27 12:07:17 INFO::including GSE17260_eset
## 2013-11-27 12:07:17 INFO::including GSE18520_eset
## 2013-11-27 12:07:17 INFO::excluding GSE19829.GPL570_eset (min.number.of.events or min.sample.size)
## 2013-11-27 12:07:17 INFO::including GSE19829.GPL8300_eset
## 2013-11-27 12:07:18 INFO::excluding GSE20565_eset (min.number.of.events or min.sample.size)
## 2013-11-27 12:07:18 INFO::excluding GSE2109_eset (min.number.of.events or min.sample.size)
## 2013-11-27 12:07:18 INFO::including GSE26712_eset
## 2013-11-27 12:07:18 INFO::excluding GSE30009_eset (min.number.of.genes)
## 2013-11-27 12:07:18 INFO::excluding GSE30161_eset (min.number.of.events or min.sample.size)
## 2013-11-27 12:07:19 INFO::including GSE32062.GPL6480_eset
## 2013-11-27 12:07:19 INFO::excluding GSE32063_eset (min.number.of.events or min.sample.size)
## 2013-11-27 12:07:19 INFO::excluding GSE6008_eset (min.number.of.events or min.sample.size)
## 2013-11-27 12:07:19 INFO::excluding GSE6822_eset (min.number.of.events or min.sample.size)
## 2013-11-27 12:07:19 INFO::including GSE9891_eset
## 2013-11-27 12:07:19 INFO::excluding PMID15897565_eset (min.number.of.events or min.sample.size)
## 2013-11-27 12:07:19 INFO::excluding PMID19318476_eset (min.number.of.events or min.sample.size)
## 2013-11-27 12:07:20 INFO::including TCGA_eset
## 2013-11-27 12:07:20 INFO::excluding TCGA.mirna.8x15kv2_eset (min.number.of.genes)
## 2013-11-27 12:07:21 INFO::Ids with missing data:

Invisibly convert dataset names to more human readable ones here…

Define the forestplot function:

forestplot <- function(esets, y = "y", probeset, formula = y ~ probeset, mlab = "Overall", 
    rma.method = "FE", at = NULL, xlab = "Hazard Ratio", ...) {
    require(metafor)
    esets <- esets[sapply(esets, function(x) probeset %in% featureNames(x))]
    coefs <- sapply(1:length(esets), function(i) {
        tmp <- as(phenoData(esets[[i]]), "data.frame")
        tmp$y <- esets[[i]][[y]]
        tmp$probeset <- exprs(esets[[i]])[probeset, ]

        summary(coxph(formula, data = tmp))$coefficients[1, c(1, 3)]
    })

    res.rma <- metafor::rma(yi = coefs[1, ], sei = coefs[2, ], method = rma.method)

    if (is.null(at)) 
        at <- log(c(0.25, 1, 4, 20))
    forest.rma(res.rma, xlab = xlab, slab = gsub("_eset$", "", names(esets)), 
        atransf = exp, at = at, mlab = mlab, ...)
    return(res.rma)
}

Create a simple average stromal content score for each patient. After scaling each gene to z score, this is the simple mean expression of stromal genes minus the mean expression of tumor genes.

## Create random assignments of stromal and tumor genes:
set.seed(10)
tmp <- as.character(c(x$Stromal.gene.symbols, x$Tumor.gene.symbols))
tmp <- tmp[tmp != ""]
random.stromal.genes <- sample(tmp, size = round(length(tmp)/2))
random.tumor.genes <- tmp[!tmp %in% random.stromal.genes]
esets.small <- esets
for (i in 1:length(esets)) {
    expr.stromal <- exprs(esets[[i]][featureNames(esets[[i]]) %in% x$Stromal.gene.symbols, 
        ])
    expr.tumor <- exprs(esets[[i]][featureNames(esets[[i]]) %in% x$Tumor.gene.symbols, 
        ])
    random.expr.stromal <- exprs(esets[[i]][featureNames(esets[[i]]) %in% random.stromal.genes, 
        ])
    random.expr.tumor <- exprs(esets[[i]][featureNames(esets[[i]]) %in% random.tumor.genes, 
        ])
    loginfo(paste(c(names(esets)[i], ": ", nrow(expr.stromal), " stromal, ", 
        nrow(expr.tumor), " tumor genes."), collapse = ""))
    esets.small[[i]] <- esets[[i]][1:3, ]
    featureNames(esets.small[[i]]) <- c("stromal.scaled", "stromal.unscaled", 
        "stromal.random")
    exprs(esets.small[[i]])[1, ] <- scale(colSums(t(scale(t(expr.stromal)))) - 
        colSums(t(scale(t(expr.tumor)))))
    exprs(esets.small[[i]])[2, ] <- scale(colSums(expr.stromal) - colSums(expr.tumor))
    exprs(esets.small[[i]])[3, ] <- scale(colSums(t(scale(t(random.expr.stromal)))) - 
        colSums(t(scale(t(random.expr.tumor)))))
}

Forest plots of Cox coefficient for stromal score

Stromal score is defined as the sum of expressions of each stromal gene, minus the sum of expressions of each tumor gene. This sum is then scaled to Z score to provide a consistent scale on the forest plots.

With each gene scaled to Z score

res <- forestplot(esets = esets.small, probeset = "stromal.scaled", at = log(c(0.5, 
    1, 2, 4)))

plot of chunk unnamed-chunk-3

No per-gene scaling is done.

Very consistent with above.

res <- forestplot(esets = esets.small, probeset = "stromal.unscaled", at = log(c(0.5, 
    1, 2, 4)))

plot of chunk unnamed-chunk-4

Genes are randomly assigned as stromal or tumor. Per-gene scaling is done.

Half of tumor / stroma genes are assigned as tumor, the other half as stroma. Likely there is substantial correlation between these genes, and I think that the confidence interval is likely anti-conservative: ie, there can be a lot of fluctuation of this interval for different random seeds. Probably the most cautious analysis would be to simulate thousands of random assignments of tumor / stromal genes, calculated the synthesized HR each time, and use this as a null distribution to obtain a p-value for the prognostic value of the true tumor / stromal assignments.

res <- forestplot(esets = esets.small, probeset = "stromal.random", at = log(c(0.5, 
    1, 2, 4)))

plot of chunk unnamed-chunk-5

Dataset properties

Table 1: Datasets used in the figure. Stage column provides early/late/unknown, histology column provides ser/clearcell/endo/mucinous/other/unknown.
PMID N samples stage histology Platform
Bentink 22348002 127 0/127/0 127/0/0/0/0/0/0 Illumina humanRef-8 v2.0
Crijns 19192944 98 0/98/0 98/0/0/0/0/0/0 Operon v3 two-color
Yoshihara 2010 20300634 43 0/43/0 43/0/0/0/0/0/0 Agilent G4112A
Mok 19962670 53 0/53/0 53/0/0/0/0/0/0 Affymetrix HG-U133Plus2
Konstantinopoulos 20547991 42 0/0/42 0/0/0/0/0/0/42 Affymetrix HG_U95Av2
Bonome 18593951 185 0/185/0 185/0/0/0/0/0/0 Affymetrix HG-U133A
Yoshihara 2012A 22241791 91 0/91/0 91/0/0/0/0/0/0 Agilent G4112F
Tothill 18698038 140 0/140/0 140/0/0/0/0/0/0 Affymetrix HG-U133Plus2
TCGA 21720365 413 0/410/3 413/0/0/0/0/0/0 Affymetrix HT_HG-U133A