# MIT License
#
# Copyright 2017 Broad Institute
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# Measure how many distinct cell barcodes remain as bases are progressively
# trimmed from either end of the barcode, alongside a random-barcode simulation.
#
# Arguments:
#   cellTypesFile  Tab-delimited file with a header, read only when
#                  cellBarcodes is NULL. Must provide at least the columns
#                  used downstream: tag, HUMAN, MOUSE, total, organism.
#   cellBarcodes   Optional character vector of barcodes. When supplied, the
#                  file is not read and a placeholder table is built instead.
#   numCells       Number of barcodes to keep (top rows by 'total' for the
#                  file, first entries for cellBarcodes); also the number of
#                  random sequences drawn per simulation replicate.
#   numBases       Currently unused; retained so existing callers keep working.
#
# Returns a list with two data frames, 'startTrimmed' and 'endTrimmed'. Each
# has one row per barcode length (untrimmed first) with the gatherStats()
# columns plus 'trimmedBases' and 'simulation' (percent of unique random
# barcodes of that length).
downsampledBarcodes<-function (cellTypesFile, cellBarcodes=NULL, numCells, numBases=1) {
	if (is.null(cellBarcodes)) {
		a <- read.table(cellTypesFile, header=TRUE, stringsAsFactors=FALSE, sep="\t")
		a <- a[order(a$total, decreasing=TRUE), ]
		a <- head(a, numCells)
	} else {
		cells <- head(cellBarcodes, numCells)
		if (length(cells) == 0) {
			stop("downsampledBarcodes: no cell barcodes supplied", call.=FALSE)
		}
		# rev(seq_along()) instead of length(cells):1, which is c(0, 1) for empty input.
		a <- data.frame(tag=cells, HUMAN=NA, MOUSE=NA, total=rev(seq_along(cells)), ratio=1, organism="HUMAN", stringsAsFactors=FALSE)
	}

	numOriginalBarcodes <- nrow(a)
	if (numOriginalBarcodes == 0) {
		stop("downsampledBarcodes: no cell barcodes to analyze", call.=FALSE)
	}

	statsOriginal <- gatherStats(a, a, numOriginalBarcodes)

	# Derive the barcode length from the data rather than assuming 12-mers.
	# For 12-base barcodes this reproduces the previous 1:11 / 12:1 ranges.
	barcodeLength <- statsOriginal$bcLength
	if (length(barcodeLength) != 1 || is.na(barcodeLength) || barcodeLength < 1) {
		stop("downsampledBarcodes: could not determine the barcode length", call.=FALSE)
	}
	trimSizes <- seq_len(barcodeLength - 1)

	# Stats for every trim size on one side of the barcode, untrimmed row first.
	trimmedStats <- function (trimStartofBases) {
		perTrim <- lapply(trimSizes, function (n) {
			clipped <- collapseBarcodes(clipBarcodes(a, n, trimStartofBases))
			gatherStats(clipped, a, numOriginalBarcodes)
		})
		stats <- do.call(rbind, c(list(statsOriginal), perTrim))
		# Each table uses its own bcLength column (the end-trimmed table
		# previously borrowed the start-trimmed one).
		stats$trimmedBases <- max(stats$bcLength) - stats$bcLength
		stats
	}

	allStats <- trimmedStats(TRUE)
	allStats2 <- trimmedStats(FALSE)

	# Simulated fraction of unique random barcodes, longest length first so
	# the values line up with the rows of the stats tables.
	simulatedFraction <- vapply(barcodeLength:1, function (len) {
		getFractionRepeatedBarcodes(len, numCells, numIterations=10, bases=c("A", "C", "G", "T"))
	}, numeric(1))

	allStats$simulation <- simulatedFraction * 100
	allStats2$simulation <- simulatedFraction * 100

	result <- list(startTrimmed=allStats, endTrimmed=allStats2)

	return (result)
	#plotD(allStats2, 5, numCells)

}

# Estimate, by simulation, the fraction of randomly generated barcodes that
# are unique. Note that despite the function name, the value is the fraction
# of UNIQUE sequences (number of distinct barcodes / numSequences), not the
# fraction of repeats.
#
# Arguments:
#   barcodeLength  Length of each random barcode.
#   numSequences   Number of random barcodes drawn per replicate.
#   numIterations  Number of replicates to average over.
#   bases          Alphabet the barcodes are sampled from.
#
# Returns the mean unique fraction across replicates (visibly, so the value
# prints when the function is called interactively).
getFractionRepeatedBarcodes<-function (barcodeLength=12, numSequences, numIterations, bases=c("A", "C", "G", "T")) {

	# One replicate: draw numSequences barcodes and report the unique fraction.
	runReplicate <- function (numSequences, barcodeLength, bases) {
		barcodes <- replicate(numSequences, generateRandomBarcodeBCDoublets(len=barcodeLength, bases))
		length(unique(barcodes)) / numSequences
	}

	replicates <- replicate(numIterations, runReplicate(numSequences, barcodeLength, bases))
	mean(replicates)
}

# Build one random barcode: draw 'len' symbols from 'bases' with replacement
# and join them into a single string.
generateRandomBarcodeBCDoublets<-function(len=12, bases=c("A", "C", "G", "T")) {
	drawn <- sample(bases, size=len, replace=TRUE)
	paste(drawn, collapse="")
}


# Summarize a (possibly trimmed and collapsed) barcode table.
#
# Arguments:
#   b                    Barcode table to summarize; uses columns 'tag' and
#                        'organism'.
#   a                    Reference table; only its row count is used, as the
#                        denominator of pctBarcodes.
#   numOriginalBarcodes  Denominator for pctOrgSpecificBarcodes.
#
# Returns a one-row data frame with:
#   bcLength                character count of the first tag in b
#   numBarcodes             number of rows in b
#   pctBarcodes             rows in b as a percentage of rows in a
#   numOrgSpecificBarcodes  rows whose organism label is exactly human or mouse
#   pctOrgSpecificBarcodes  that count as a percentage of numOriginalBarcodes
gatherStats<-function (b, a, numOriginalBarcodes) {
	numBarcodes <- nrow(b)
	pct <- numBarcodes / nrow(a) * 100
	# as.character() so a factor tag column does not break nchar().
	bcLength <- nchar(as.character(b$tag[1]))
	# Match organism labels case-insensitively: tables in this file are built
	# with upper-case labels ("HUMAN"), which an exact "Human"/"Mouse"
	# comparison never counts. Merged labels such as "HUMAN:MOUSE" still do
	# not match, so only single-organism barcodes are counted.
	organism <- toupper(as.character(b$organism))
	numOrgSpecificBarcodes <- sum(organism %in% c("HUMAN", "MOUSE"))

	r <- data.frame(bcLength=bcLength, numBarcodes=numBarcodes, pctBarcodes=pct, numOrgSpecificBarcodes=numOrgSpecificBarcodes, pctOrgSpecificBarcodes=(numOrgSpecificBarcodes/numOriginalBarcodes)*100)
	return (r)
}


#x=allStats2
# Plot the percentage of barcodes remaining against the number of trimmed
# bases, for the observed barcodes (blue) and the random simulation (green).
#
# Arguments:
#   x            Stats table with columns bcLength, trimmedBases, pctBarcodes
#                and simulation.
#   minBCLength  Rows with a shorter barcode length are left out of the plot.
#   numCells     Number shown in the legend labels.
plotD<-function (x, minBCLength=5, numCells) {
	kept <- x[x$bcLength >= minBCLength, ]

	# The y axis runs from the lowest plotted value up to 100 percent.
	yFloor <- min(kept$pctBarcodes, kept$simulation)
	plot(range(kept$trimmedBases), c(yFloor, 100), type='n', xlab="Number of Trimmed Bases", ylab="Percent of BCs Remaining", axes=TRUE)

	lines(kept$trimmedBases, kept$simulation, col="green", lwd=2)
	lines(kept$trimmedBases, kept$pctBarcodes, col="blue", lwd=2)

	legendLabels <- c(paste(numCells, "cell barcodes"), paste(numCells, "random 12-mers"))
	legend("bottomleft", legend=legendLabels, fill=c("blue", "green"))
}

# Merge rows that share the same barcode ('tag'). For each tag the HUMAN,
# MOUSE and total read counts are summed, ratio is recomputed as
# sum(HUMAN)/sum(total), the distinct organism labels are joined with ":",
# and numOriginalBarcodes records how many input rows were merged.
# Returns a data.table with one row per tag.
collapseBarcodes<-function (b) {
	barcodeTable <- data.table(b)

	# Summarize the rows belonging to a single tag into one row.
	summarizeGroup <- function (grp) {
		humanReads <- sum(grp$HUMAN)
		totalReads <- sum(grp$total)
		organismLabel <- paste(unique(grp$organism), collapse=":")
		data.frame(HUMAN=humanReads, MOUSE=sum(grp$MOUSE), total=totalReads, ratio=humanReads/totalReads, organism=organismLabel, numOriginalBarcodes=nrow(grp), stringsAsFactors=FALSE)
	}

	barcodeTable[, summarizeGroup(.SD), by="tag"]
}


# Return a copy of barcode table 'a' whose 'tag' column has had numBases
# bases removed from every barcode.
#
# Arguments:
#   a                 Table with a 'tag' column of barcodes.
#   numBases          Number of bases to remove from each barcode.
#   trimStartofBases  TRUE removes bases from the start of the barcode,
#                     FALSE removes them from the end.
clipBarcodes<-function (a, numBases, trimStartofBases=TRUE) {
	# Modifying b leaves the caller's table untouched.
	b <- a
	# vapply keeps 'tag' a character vector even when the table has no rows;
	# sapply would return list() there and turn the column into a list.
	b$tag <- vapply(b$tag, clipBarcode, character(1), numBases, trimStartofBases, USE.NAMES=FALSE)
	return (b)
}

# Remove numBases characters from one end of barcode string(s) x.
# trimStartofBases chooses the end: a true value drops the leading
# characters, otherwise the trailing characters are dropped.
clipBarcode<-function (x, numBases, trimStartofBases=T) {
	fullLength <- nchar(x)
	if (trimStartofBases) {
		firstKept <- numBases + 1
		lastKept <- fullLength
	} else {
		firstKept <- 1
		lastKept <- fullLength - numBases
	}
	substr(x, firstKept, lastKept)
}
