# MIT License
#
# Copyright 2017 Broad Institute
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# predict the number of transcripts by projecting from the downsampling

#Questions to answer:
#1) Is this robust?  Project results using downsampling 0.1-0.5, then predict 0.6-1.  How close to the right answer are the results?
#2) Normalize #transcripts to fractions of 100% of the predicted data for each decile and plot
#3) Convert to fractional gain of transcripts at each step and plot fractional gains as %'s.  IE: going from 100 to 140 transcripts is 40% gain.  (xnew -x) / x



#downsampledData is a data.frame with the barcode as the rowname, and the columns are downsampled data at fractions of 1.
#molecularBarcodeDistributionByGeneFile="/downloads/test/PC1_20000_molBC.txt.gz"
#summaryColumn="num_trans";thresholdList=seq(0.1,1,0.1);estimatedNumCells=7500
#downsampledData=gatherSensitivityByDownsampling(transcriptFile=molecularBarcodeDistributionByGeneFile, summaryColumn, thresholdList, estimatedNumCells)
predictTranscriptsGainedPerRead<-function (downsampledData, quantileSequence=seq(0.1, 1, by=0.1), maxExpansion=2.5, excludeLargestSeries=F, organism=NULL) {
    
    #try this fit with deciles
    
    quantileTranscripts=apply(downsampledData, 2, function(x) quantile(x, quantileSequence))
    if (excludeLargestSeries) {
        quantileSequence=quantileSequence[-length(quantileSequence)]
        quantileTranscripts=quantileTranscripts[-dim(quantileTranscripts)[1],]
    }
    predictedData=apply(quantileTranscripts, 1, predictNumTranscripts, maxExpansion=maxExpansion)
    
    
    colors=rainbow(length(quantileSequence))
    xRange=range(as.numeric(rownames(predictedData)))
    yRange=c(0, max(predictedData))
    x=as.numeric(colnames(downsampledData))

    if (is.null(organism)) {
        titleSuffix = ""
    } else {
        titleSuffix = paste0(" (", organism, ")")
    }

    plot (xRange, yRange, xlim=xRange, ylim=yRange,
        xlab="Reads generated (relative to this run)", ylab="Transcripts per cell",
        type='n', main=paste0("Return to sequencing coverage (downsampling + projection)", titleSuffix))
    for (i in 1:(dim(quantileTranscripts)[1])) {
        qt=    quantileTranscripts[i,]
        points(x, qt, col=colors[i])
        lines(as.numeric(rownames(predictedData)), predictedData[,i], col=colors[i])
    }
    par(xpd=T)
    legend("topleft", legend=quantileSequence, fill=colors, title="Library Quantile", ncol=2)
    par(xpd=F)
    
}

downsampleCells<-function (transcriptFile, transcripts, barcodeList=NA, cellTypesFile=NA, organism="Human", thresholdList=seq(0.1,1,0.1)) {
	if (!is.null(transcriptFile)) {
		transcripts=read.table(transcriptFile, header=T, stringsAsFactors=F, sep="\t")
	}

	#if there's a cellTypesFile, use it to populate the barcodeList.
	if (!is.na(cellTypesFile)) {
		cellTypes=read.table(cellTypesFile, header=T, stringsAsFactors=F)
		barcodeList= cellTypes[cellTypes$organism==organism,]$tag
	}

	if (all(!is.na(barcodeList))) {
		idx=match(transcripts$Cell.Barcode, barcodeList)
		idx=which(!is.na(idx))
		transcripts = transcripts[idx,]
	}


	a=data.table(transcripts)
	rm(transcripts)

	downsampleRow<-function (x, fraction=0.5) {
		length(which(runif(x)<fraction))

	}

	#threshold=0.1
	downsampleData<-function (threshold, x) {
		r=x[,downsampleRow(Num_Obs, threshold), keyby=as.numeric(rownames(x))]
		r[,Gene:=x$Gene]
		r=r[r$V1!=0,]
		rr=list(downsampling_ratio=threshold, num_reads=sum(r$V1), num_transcripts=dim(r)[1], num_genes=length(unique(r$Gene)))
		return (rr)
	}


	#x=a[a$Cell.Barcode=="AAAAGGACAAAG", ]
	downsampleDataByCell<-function (x, thresholdList) {
		z=lapply(thresholdList, downsampleData, x)
		z=data.frame(do.call(rbind, z))
		return (z)
	}

	#debug(downsampleDataByCell)
	rr=a[,downsampleDataByCell(.SD, thresholdList),by=Cell.Barcode]
	rr=data.frame(barcode=rr$Cell.Barcode, ratio=unlist(rr[["downsampling_ratio"]]), num_reads=unlist(rr[["num_reads"]]), num_trans=unlist(rr[["num_transcripts"]]), num_genes=unlist(rr[["num_genes"]]), stringsAsFactors=F)
	rr=data.table(rr, key="barcode")

	return (rr)

}

gatherSensitivityByDownsampling<-function (transcriptFile, summaryColumn="num_genes", thresholdList=seq(0.1,1,0.1), estimatedNumCells=0) {
    #validate column names
    validColumnNames=c("num_genes", "num_trans")

    if (length(intersect(summaryColumn, validColumnNames))!=1) {
        warning(paste("Please use a valid summary column.  Given column [", summaryColumn, "] Valid choices: [", paste(validColumnNames, collapse=" "), "]", sep=""))
    }

    #read in transcripts and limit data to a subset of big cells with many transcripts, if the estimatedNumCells is not 0.
    if (estimatedNumCells==0) {
        rr= downsampleCells(transcriptFile=transcriptFile, transcripts=NULL, barcodeList=NA, cellTypesFile=NA, organism=NULL, thresholdList=thresholdList)
    } else {
        transcripts=read.table(transcriptFile, header=T, stringsAsFactors=F, sep="\t")
        td=data.table(transcripts, key="Cell.Barcode")
        getNumTranscriptsPerCell<-function (x) {
            dim (unique(x[,c("Gene", "Molecular_Barcode"),with=F]))[1]
        }
        transcriptsPerCell=td[, getNumTranscriptsPerCell(.SD), key=Cell.Barcode]
        transcriptsPerCell=transcriptsPerCell[order(transcriptsPerCell$V1,decreasing=T),]
        cellBC=head(transcriptsPerCell$Cell.Barcode, n=estimatedNumCells)
        #filter transcripts to the final list of cells.  This is the top <X> cells by #transcripts.
        idx=which(!is.na(match(transcripts$Cell.Barcode, cellBC)))
        transcripts=transcripts[idx,]
        rr= downsampleCells(transcriptFile=NULL, transcripts=transcripts, barcodeList=NA, cellTypesFile=NA, organism=NULL, thresholdList=thresholdList)
    }

	#More concise way to reshape the data.
	rrr=rr[,c("barcode", "ratio", summaryColumn), with=F]
	z=dcast(rrr, formula = barcode ~ ratio, value.var="num_trans")
	rownames(z)=z$barcode
	#z=z[,-1,with=F]
	z=z[,-1]
    return (z)
}

runTranscriptDownsampling<-function (molecularBarcodeDistributionByGeneFile, summaryColumn="num_trans", thresholdList=seq(0.1,1,0.1), estimatedNumCells=7500, outFile=NULL) {
    downsampledData=gatherSensitivityByDownsampling(transcriptFile=molecularBarcodeDistributionByGeneFile, summaryColumn=summaryColumn, thresholdList=thresholdList, estimatedNumCells=estimatedNumCells)
    if (!is.null(outFile)) write.table(downsampledData, outFile, row.names=F, col.names=T, quote=F, sep="\t")
    return (downsampledData)
}

predictNumTranscripts<-function (qt, maxExpansion=5) {
    ratios=as.numeric(names(qt))
    fit  <- lm(qt~log(ratios))
    xx=seq(min(ratios), max(ratios*maxExpansion), by=0.1)
    newY=predict(fit, data.frame(ratios=xx))
    names (newY)=xx
    return (newY)
}

testRobustness<-function (downsampledData) {
    quantileSequence=seq(0.1, 1, by=0.1)
    maxExpansion=5
    
    quantileTranscripts=apply(downsampledData, 2, function(x) quantile(x, quantileSequence))
    x=as.numeric(colnames(downsampledData))
    
    predictedData=apply(quantileTranscripts[,1:5], 1, predictNumTranscripts, maxExpansion=5)
    s=seq(0.6, 1, 0.1)
    idx=match(s, rownames(predictedData))
    p=predictedData[idx,]
    a=t(quantileTranscripts[,match(s, colnames(quantileTranscripts))])
    #calculate the difference between the predicted and actual data.
    #the difference between the predicted and actual, divided by the max of the two, as a %.
    diff=(p-a)/pmax(p,a)*100
    
    colors=rainbow(length(quantileSequence))
    xRange=range(as.numeric(rownames(predictedData)))
    yRange=c(0, max(predictedData))
    
    plot (xRange, yRange, xlim=xRange, ylim=yRange, xlab="fraction of reads", ylab="Number of transcripts", type='n')    
    for (i in 1:(dim(quantileTranscripts)[1])) {
        qt=    quantileTranscripts[i,]
        points(x, qt, col=colors[i])
        lines(as.numeric(rownames(predictedData)), predictedData[,i], col=colors[i])
    }
    par(xpd=T)
    legend("topleft", legend=quantileSequence, fill=colors, title="Library Quantile", ncol=2)
    par(xpd=F)
    
}

#plotAsPercent(downsampledData)
predictTranscriptsGainedPerReadAsPercent<-function (downsampledData, quantileSequence=seq(0.1, 1, by=0.1), maxExpansion=2.5, organism=NULL) {

    if (is.null(organism)) {
        titleSuffix = ""
    } else {
        titleSuffix = paste0(" (", organism, ")")
    }

    quantileTranscripts=apply(downsampledData, 2, function(x) quantile(x, quantileSequence))
    predictedData=apply(quantileTranscripts, 1, predictNumTranscripts, maxExpansion=maxExpansion)
    
    #convert to fraction of expression compared to full read depth.
    quantileTranscripts/quantileTranscripts[,10]
    
    predictedData=t(predictedData)
    predictedData=predictedData/predictedData[,10]
    
    colors=rainbow(length(quantileSequence))
    xRange=range(as.numeric(colnames(predictedData)))
    yRange=c(0, max(predictedData))
    x=as.numeric(colnames(downsampledData))
    plot (xRange, yRange, xlim=xRange, ylim=yRange,
        xlab="Reads generated (relative to this run)", ylab="Transcripts per cell (relative to this run)",
    type='n', main=paste0("Return to sequencing coverage (downsampling + projection)", titleSuffix))
    for (i in 1:(dim(quantileTranscripts)[1])) {
        qt=    quantileTranscripts[i,]
        points(x, qt, col=colors[i])
        lines(as.numeric(colnames(predictedData)), predictedData[i,], col=colors[i])
    }
    par(xpd=T)
    legend("topleft", legend=quantileSequence, fill=colors, title="Library Quantile", ncol=2)
    par(xpd=F)
}

#dotChartTranscripts(downsampledData)
dotChartTranscriptsGain<-function (downsampledData,
                quantileSequence=seq(0.1, 1, by=0.1),
                readMultiplier=c(0.5, 2, 10),
                organism="") {
    
    maxExpansion=max(readMultiplier)
    
    quantileTranscripts=apply(downsampledData, 2, function(x) quantile(x, quantileSequence))
    predictedData=apply(quantileTranscripts, 1, predictNumTranscripts, maxExpansion=maxExpansion)
    
    predictedData=t(predictedData)
    idx=which(quantileSequence==1)
    predictedData=predictedData/predictedData[,idx]
    
    d=predictedData[,match(readMultiplier, colnames(predictedData))]
    cols=rainbow(length(readMultiplier))
    colors=as.vector(sapply(cols, function (x, len) rep(x, len), length(quantileSequence)))
    
    multiplierStrings=paste(readMultiplier, "x", sep="")

    if (is.null(organism)) {organism = ""}

    dotchart(d, xlim=range(d), ylab="decile of library size", xlab="Transcripts per cell (relative to current run)", 
             main=paste(organism, "Return to sequencing coverage at\n", paste(multiplierStrings, collapse=", "), "the current coverage", sep=" "), col=colors, pch=16, gpch=0, gcolor="white")
    legend("topright", legend=paste(readMultiplier, "x", sep=""), fill=cols)
}

#downsampledDataFile="/downloads/test/PC1_20000_transcript_downsampling.txt"
#outFile="/downloads/test/PC1_20000_transcript_downsampling_deciles.txt"
generateTranscriptQuantileTable<-function (downsampledDataFile=NULL, outFile=NULL) {
    #TRY 2
    downsampledData=read.table(downsampledDataFile, header=T, stringsAsFactors=F, check.names=F)
    downsampledData=data.table(downsampledData)
    d=downsampledData[,dim(downsampledData)[2],with=F]
    setnames(d, "1", "value")
    d=d[order(d$value),]
    
    quantileSequence=c(0, 0.02, seq(0.05, 0.95, by=0.1), 0.98, 1)
    
    x=quantile(d$value, c(0, 0.01, seq(0.1, 0.9, by=0.1), 0.99, 1))
    
    d[,quartile:=cut(value,
                     breaks=quantile(value,probs=quantileSequence),
                     labels=1:(length(quantileSequence)-1),right=F)]
    d$quartile=as.numeric(d$quartile)
    d=d[order(d$value, descending=T),]
    #if there are very few cells, then things get slightly more wonky.
    
    getStats<-function (x) {
        xx=as.numeric(x)
        r=data.frame(min=min(xx), max=max(xx), median=median(xx), size=length(xx))
        return (r)
    }
    
    getMidPoints<-function (d) {
        quartiles=unique(d$quartile)
        getMP<-function(threshold, d) {
            (length(which(d$quartile<threshold))+(length(which(d$quartile==threshold))*0.5))/dim(d)[1]
        }
        r=sapply(quartiles, getMP, d)
        return (r)
    }

    getIndexOfDesiredQuantile<-function (dq, result) {
        which.min(abs(result$midPoint-dq))
    }
    
    result=d[,getStats(value),by=quartile]
    result$midPoint=getMidPoints(d)
    
    desiredQuantiles=c(0.01, seq(0.1, 0.9, by=0.1), 0.99)
    desiredQuantiles=c(0.01, 0.035, seq(0.1, 0.9, by=0.1), 0.965, 0.99)
    idx=sapply(desiredQuantiles, getIndexOfDesiredQuantile, result)    
    result=result[idx,]
    result$labels=desiredQuantiles
    
    #add smallest and largest cells.
    smallest=data.frame(quartile=0, min=min(d$value), max=min(d$value), median=min(d$value), size=1, midPoint=0, labels="Smallest_Cell", stringsAsFactors=F)
    largest=data.frame(quartile=max(result$quartile)+1, min=max(d$value), max=max(d$value), median=max(d$value), size=1, midPoint=1, labels="Largest_Cell", stringsAsFactors=F)
    result=rbind(smallest, result, largest)
    result[result$labels==0.01,]$size=result[result$labels==0.01,]$size-1
    result$cumsize=cumsum(result$size)
    
    result=result[,c("labels", "size", "cumsize", "min", "max", "median"),with=F]
    colnames(result)=c("Quantile", "bin_size", "cumulative_num_cells", "min_transcripts", "max_transcripts", "median_transcripts")
    if (!is.null(outFile)) write.table(result, outFile, row.names=F, col.names=T, quote=F, sep="\t")
    return (result)
    
    
}

#downsampledDataFile="/downloads/test/PC1_20000_transcript_downsampling.txt"
#outFile="/downloads/test/PC1_20000_transcript_downsampling_deciles.txt"
#downsampledDataFile="/downloads/test2/reports/N702_auto_transcript_downsampling.txt"
#outFile="/downloads/test2/N702_auto_transcript_downsampling_deciles.txt"
generateTranscriptQuantileTableSimple<-function (downsampledDataFile=NULL, outFile=NULL) {
    
    downsampledData=read.table(downsampledDataFile, header=T, stringsAsFactors=F, check.names=F)
    downsampledData=data.table(downsampledData)
    d=downsampledData[,dim(downsampledData)[2],with=F]
    setnames(d, "1", "value")
    d=d[order(d$value),]
    
    quantileSequence=c(0, 0.01, seq(0.1, 0.9, by=0.1), 0.99, 1)
    
    x=quantile(d$value, quantileSequence)
    quantilePosition=ceiling(dim (d)[1]*quantileSequence)
    #because 0 isn't a thing.
    quantilePosition[1]=1
    
    #reverse the 1-x ranking
    quantilePosition=quantilePosition[length(quantilePosition):1]
    
    result=data.frame(quantile=quantileSequence, num_cells=quantilePosition, median_transcripts=x, stringsAsFactors=F)
    result$labels=result$quantile
    result=result[order(result$quantile, decreasing=T),]
    result[1,]$labels="Largest_Cell"
    result[dim(result)[1],]$labels="Smallest_Cell"
    

    result=result[,c("labels", "num_cells", "median_transcripts"),]
    colnames(result)=c("quantile", "cumulative_num_cells", "median_transcripts")
    if (!is.null(outFile)) write.table(result, outFile, row.names=F, col.names=T, quote=F, sep="\t")
    return (result)
}

#decilesTableFile="/downloads/test/PC1_20000_transcript_downsampling_deciles.txt"
#decilesTableFile="/downloads/test2/N702_auto_transcript_downsampling_deciles.txt"
plotDecilesTable<-function (decilesTableFile, organism=NULL) {
    
    result=read.table(decilesTableFile, header=T, stringsAsFactors=F, sep="\t")
    result$cumulative_num_cells=prettyNum(result$cumulative_num_cells,big.mark=",",scientific=FALSE)
    result$median_transcripts=prettyNum(round(result$median_transcripts),big.mark=",",scientific=FALSE)
    
    colnames(result)=c("Quantile", "#Cells Total", "# Transcripts")
    
    idx=which(result$Quantile=="Smallest_Cell")
    result[idx,]$Quantile="Smallest Cell"
    idx=which(result$Quantile=="Largest_Cell")
    result[idx,]$Quantile="Largest Cell"
    
    z=tableGrob(result, rows=NULL)
    if (!is.null(organism)) {
        # c.f. http://stackoverflow.com/questions/31640916/how-can-i-add-a-title-to-a-tablegrob-plot
        title <- textGrob(organism,gp=gpar(fontsize=20))
        padding <- unit(5,"mm")

        table <- gtable_add_rows(z,
                heights = grobHeight(title) + padding,
                pos = 0)
        z <- gtable_add_grob(table, title, 1, 1, 1, ncol(table))

    }
    grid.newpage()
    grid.draw(z)
}
