/*
 * Decompiled with CFR 0.152.
 */
package org.broadinstitute.dropseqrna.annotation;

import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.reference.ReferenceSequence;
import htsjdk.samtools.reference.ReferenceSequenceFileWalker;
import htsjdk.samtools.util.CloserUtil;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.Interval;
import htsjdk.samtools.util.IntervalList;
import htsjdk.samtools.util.Locatable;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.OverlapDetector;
import htsjdk.samtools.util.SequenceUtil;
import java.io.File;
import java.io.PrintStream;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Set;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.builder.EqualsBuilder;
import org.apache.commons.math3.stat.descriptive.DescriptiveStatistics;
import org.broadinstitute.dropseqrna.annotation.GQuadruplex;
import org.broadinstitute.dropseqrna.annotation.GeneAnnotationReader;
import org.broadinstitute.dropseqrna.cmdline.MetaData;
import org.broadinstitute.dropseqrna.utils.FastaSequenceFileWriter;
import org.broadinstitute.dropseqrna.utils.io.ErrorCheckingPrintStream;
import org.broadinstitute.dropseqrna.utils.referencetools.ReferenceUtils;
import picard.annotation.Gene;
import picard.cmdline.CommandLineProgram;
import picard.cmdline.CommandLineProgramProperties;
import picard.cmdline.Option;

@CommandLineProgramProperties(usage="Given a GTF file and a reference sequence, produce a report containing the %GC and length of each gene.  GC is calculated for each gene by finding the unique set of base positions overlapping an exon and counting [G/C] bases compared to the total number of bases.Length is calculated by computing the length of each transcript for the gene, and taking the median value", usageShort="Calculate GC content and length for genes", programGroup=MetaData.class)
public class GatherGeneGCLength
extends CommandLineProgram {
    private static final Log log = Log.getInstance(GatherGeneGCLength.class);
    @Option(doc="The GTF file containing gene models to generate length and GC metrics from")
    public File GTF;
    @Option(shortName="O", doc="The output report containg the genes and GC/Length metrics.  Output at the gene level, using the median values across transcripts.")
    public File OUTPUT;
    @Option(doc="The output report containg the genes and GC/Length metrics at the transcript level.", optional=true)
    public File OUTPUT_TRANSCRIPT_LEVEL;
    @Option(doc="The sequences of each transcript", optional=true)
    public File OUTPUT_TRANSCRIPT_SEQUENCES;
    @Option(shortName="R", doc="The reference fasta")
    public File REFERENCE;
    private DescriptiveStatistics stats = new DescriptiveStatistics();
    private DecimalFormat percentageFormat = new DecimalFormat("###.#");

    protected int doWork() {
        ReferenceSequenceFileWalker refFileWalker;
        SAMSequenceDictionary dict;
        IOUtil.assertFileIsReadable((File)this.GTF);
        IOUtil.assertFileIsWritable((File)this.OUTPUT);
        ErrorCheckingPrintStream out = new ErrorCheckingPrintStream(IOUtil.openFileForWriting((File)this.OUTPUT));
        this.writeHeader(out);
        ErrorCheckingPrintStream outTranscript = null;
        if (this.OUTPUT_TRANSCRIPT_LEVEL != null) {
            outTranscript = new ErrorCheckingPrintStream(IOUtil.openFileForWriting((File)this.OUTPUT_TRANSCRIPT_LEVEL));
            this.writeHeaderTranscript(outTranscript);
        }
        FastaSequenceFileWriter outSequence = null;
        if (this.OUTPUT_TRANSCRIPT_SEQUENCES != null) {
            IOUtil.assertFileIsWritable((File)this.OUTPUT_TRANSCRIPT_SEQUENCES);
            outSequence = new FastaSequenceFileWriter(this.OUTPUT_TRANSCRIPT_SEQUENCES);
        }
        if ((dict = (refFileWalker = new ReferenceSequenceFileWalker(this.REFERENCE)).getSequenceDictionary()) == null) {
            CloserUtil.close((Object)refFileWalker);
            throw new IllegalArgumentException("Reference file" + this.REFERENCE.getAbsolutePath() + " is missing a dictionary file [.dict].  Please make one!");
        }
        OverlapDetector<Gene> geneOverlapDetector = GeneAnnotationReader.loadAnnotationsFile(this.GTF, dict);
        List records = dict.getSequences();
        for (SAMSequenceRecord record : records) {
            String seqName = record.getSequenceName();
            int seqIndex = dict.getSequenceIndex(seqName);
            ReferenceSequence fastaRef = refFileWalker.get(seqIndex);
            Interval i = new Interval(seqName, 1, record.getSequenceLength());
            Set genes = geneOverlapDetector.getOverlaps((Locatable)i);
            for (Gene g : genes) {
                List<GCResult> gcList = this.calculateGCContentGene(g, fastaRef, dict);
                if (this.OUTPUT_TRANSCRIPT_LEVEL != null) {
                    this.writeResultTranscript(gcList, (PrintStream)outTranscript);
                }
                GCIsoformSummary summary = new GCIsoformSummary(g, gcList);
                if (this.OUTPUT_TRANSCRIPT_SEQUENCES != null) {
                    this.writeTranscriptSequence(g, fastaRef, dict, outSequence);
                }
                GCResult gc = this.calculateGCContentUnionExons(g, fastaRef, dict);
                this.writeResult(gc, summary, out);
            }
        }
        CloserUtil.close((Object)refFileWalker);
        CloserUtil.close((Object)out);
        if (this.OUTPUT_TRANSCRIPT_LEVEL != null) {
            CloserUtil.close((Object)outTranscript);
        }
        if (this.OUTPUT_TRANSCRIPT_SEQUENCES != null) {
            CloserUtil.close((Object)outSequence);
        }
        return 0;
    }

    private GCResult calculateGCContentUnionExons(Gene gene, ReferenceSequence fastaRef, SAMSequenceDictionary dict) {
        SAMFileHeader h = new SAMFileHeader();
        h.setSequenceDictionary(dict);
        h.setSortOrder(SAMFileHeader.SortOrder.unsorted);
        IntervalList intervalList = new IntervalList(h);
        for (Gene.Transcript t : gene) {
            for (Gene.Transcript.Exon e : t.exons) {
                intervalList.add(new Interval(gene.getContig(), e.start, e.end, gene.isNegativeStrand(), gene.getName()));
            }
        }
        List uniqueIntervals = IntervalList.getUniqueIntervals((IntervalList)intervalList, (boolean)false);
        GCResult result = new GCResult(0, 0, 0);
        for (Interval i : uniqueIntervals) {
            GCResult gcResultInterval = this.calculateGCContentExon(i, fastaRef);
            result.increment(gcResultInterval);
        }
        return result;
    }

    private List<GCResult> calculateGCContentGene(Gene gene, ReferenceSequence fastaRef, SAMSequenceDictionary dict) {
        ArrayList<GCResult> result = new ArrayList<GCResult>();
        for (Gene.Transcript t : gene) {
            String seq = this.getTranscriptSequence(t, fastaRef, dict);
            GCResult gc = new GCResult(seq);
            gc.setTranscript(t);
            List<GQuadruplex> gq = GQuadruplex.find(t.name, seq);
            gc.incrementGQuadruplexCount(gq.size());
            result.add(gc);
        }
        return result;
    }

    public static double getMedian(double[] data) {
        Arrays.sort(data);
        int numTranscripts = data.length;
        int middle = numTranscripts / 2;
        if (numTranscripts % 2 == 1) {
            double result = data[middle];
            return result;
        }
        double result = (data[middle - 1] + data[middle]) / 2.0;
        return result;
    }

    public void writeTranscriptSequence(Gene gene, ReferenceSequence fastaRef, SAMSequenceDictionary dict, FastaSequenceFileWriter outSequence) {
        for (Gene.Transcript t : gene) {
            String seqName = gene.getName() + " " + t.name;
            String sequence = this.getTranscriptSequence(t, fastaRef, dict);
            outSequence.writeSequence(seqName, sequence);
        }
    }

    public String getTranscriptSequence(Gene.Transcript transcript, ReferenceSequence fastaRef, SAMSequenceDictionary dict) {
        StringBuilder b = new StringBuilder();
        for (Gene.Transcript.Exon e : transcript.exons) {
            Interval i = new Interval(transcript.getGene().getContig(), e.start, e.end, transcript.getGene().isNegativeStrand(), transcript.getGene().getName());
            String seq = ReferenceUtils.getSequence(fastaRef.getBases(), i);
            b.append(seq);
        }
        String finalSeq = b.toString();
        finalSeq = finalSeq.toUpperCase();
        if (transcript.getGene().isNegativeStrand()) {
            finalSeq = SequenceUtil.reverseComplement((String)finalSeq);
        }
        return finalSeq;
    }

    private GCResult calculateGCContentExon(Interval interval, ReferenceSequence fastaRef) {
        String seq = ReferenceUtils.getSequence(fastaRef.getBases(), interval);
        if (interval.isNegativeStrand()) {
            seq = SequenceUtil.reverseComplement((String)seq);
        }
        GCResult result = new GCResult(seq);
        return result;
    }

    private void writeHeader(PrintStream out) {
        Object[] header = new String[]{"GENE", "CHR", "START", "END", "PCT_GC_UNIQUE_EXON_BASES", "PCT_GC_ISOFORM_AVERAGE", "PCT_C_ISOFORM_AVERAGE", "PCT_G_ISOFORM_AVERAGE", "MEDIAN_TRANSCRIPT_LENGTH", "NUM_TRANSCRIPTS", "MEDIAN_GQUADRUPLEXES"};
        String h = StringUtils.join((Object[])header, (String)"\t");
        out.println(h);
    }

    private void writeResult(GCResult gc, GCIsoformSummary summary, PrintStream out) {
        Gene gene = summary.getGene();
        Object[] line = new String[]{gene.getName(), gene.getContig(), Integer.toString(gene.getStart()), Integer.toString(gene.getEnd()), this.percentageFormat.format(gc.getGCPercent()), this.percentageFormat.format(summary.getMedianGC()), this.percentageFormat.format(summary.getMedianC()), this.percentageFormat.format(summary.getMedianG()), Integer.toString(summary.getMedianTranscriptLength()), Integer.toString(summary.getNumTranscripts()), Integer.toString(summary.getMedianGQuadruplexes())};
        String h = StringUtils.join((Object[])line, (String)"\t");
        out.println(h);
    }

    private void writeHeaderTranscript(PrintStream out) {
        Object[] header = new String[]{"TRANSCRIPT", "CHR", "START", "END", "PCT_GC", "PCT_C", "PCT_G", "TRANSCRIPT_LENGTH", "NUM_GQUADRUPLEXES"};
        String h = StringUtils.join((Object[])header, (String)"\t");
        out.println(h);
    }

    public void writeResultTranscript(List<GCResult> gcList, PrintStream out) {
        for (GCResult gc : gcList) {
            this.writeResultTranscript(gc, out);
        }
    }

    public void writeResultTranscript(GCResult gc, PrintStream out) {
        Object[] line = new String[]{gc.getTranscript().name, gc.getTranscript().getGene().getContig(), Integer.toString(gc.getTranscript().start()), Integer.toString(gc.getTranscript().end()), this.percentageFormat.format(gc.getGCPercent()), this.percentageFormat.format(gc.getCPercent()), this.percentageFormat.format(gc.getGPercent()), Integer.toString(gc.getRegionLength()), Integer.toString(gc.getNumGQuadruplexesObserved())};
        String h = StringUtils.join((Object[])line, (String)"\t");
        out.println(h);
    }

    public static void main(String[] args) {
        System.exit(new GatherGeneGCLength().instanceMain(args));
    }

    public class GCResult {
        private int regionLength = 0;
        private Gene.Transcript transcript = null;
        private int gCount = 0;
        private int cCount = 0;
        private int numGQuadruplexesObserved = 0;

        public GCResult(String sequence) {
            String seq = sequence.toUpperCase();
            char[] seqArray = seq.toCharArray();
            this.regionLength = seqArray.length;
            for (char c : seqArray) {
                if (c == 'C') {
                    this.incrementC(1);
                }
                if (c != 'G') continue;
                this.incrementG(1);
            }
        }

        public GCResult(int regionLength, int cCount, int gCount) {
            this.regionLength = regionLength;
            this.cCount += cCount;
            this.gCount += gCount;
        }

        public int getRegionLength() {
            return this.regionLength;
        }

        public int getGcCount() {
            return this.gCount + this.cCount;
        }

        public double getGCPercent() {
            double result = (double)this.getGcCount() / (double)this.regionLength * 100.0;
            return result;
        }

        public double getCPercent() {
            double result = (double)this.cCount / (double)this.regionLength * 100.0;
            return result;
        }

        public double getGPercent() {
            double result = (double)this.gCount / (double)this.regionLength * 100.0;
            return result;
        }

        public void incrementRegionLength(int length) {
            this.regionLength += length;
        }

        public void incrementG(int count) {
            this.gCount += count;
        }

        public void incrementC(int count) {
            this.cCount += count;
        }

        public void increment(GCResult other) {
            this.cCount += other.cCount;
            this.gCount += other.gCount;
            this.regionLength += other.regionLength;
        }

        public void incrementGQuadruplexCount(int count) {
            this.numGQuadruplexesObserved += count;
        }

        public int getNumGQuadruplexesObserved() {
            return this.numGQuadruplexesObserved;
        }

        public Gene.Transcript getTranscript() {
            return this.transcript;
        }

        public void setTranscript(Gene.Transcript transcript) {
            this.transcript = transcript;
        }

        public boolean equals(Object obj) {
            if (obj == null) {
                return false;
            }
            if (obj == this) {
                return true;
            }
            if (obj.getClass() != this.getClass()) {
                return false;
            }
            GCResult rhs = (GCResult)obj;
            return new EqualsBuilder().appendSuper(super.equals(obj)).append(this.gCount, rhs.gCount).append(this.cCount, rhs.cCount).append(this.regionLength, rhs.regionLength).append(this.numGQuadruplexesObserved, rhs.numGQuadruplexesObserved).isEquals();
        }

        public String toString() {
            return "Length [" + this.regionLength + "] %G [" + GatherGeneGCLength.this.percentageFormat.format(this.getGPercent()) + "] %C [" + GatherGeneGCLength.this.percentageFormat.format(this.getCPercent()) + "] %GC [" + GatherGeneGCLength.this.percentageFormat.format(this.getGCPercent()) + "]G-Quadruplexes [" + this.numGQuadruplexesObserved + "]";
        }
    }

    public class GCIsoformSummary {
        private List<GCResult> transcriptGCList;
        private Gene gene;

        public GCIsoformSummary(Gene gene, List<GCResult> transcriptGCList) {
            this.transcriptGCList = transcriptGCList;
            this.gene = gene;
        }

        public double getMedianGC() {
            double result = GatherGeneGCLength.getMedian(this.transcriptGCList.stream().mapToDouble(GCResult::getGCPercent).toArray());
            return result;
        }

        public double getMedianG() {
            double result = GatherGeneGCLength.getMedian(this.transcriptGCList.stream().mapToDouble(GCResult::getGPercent).toArray());
            return result;
        }

        public double getMedianC() {
            double result = GatherGeneGCLength.getMedian(this.transcriptGCList.stream().mapToDouble(GCResult::getCPercent).toArray());
            return result;
        }

        public int getMedianTranscriptLength() {
            double result = GatherGeneGCLength.getMedian(this.transcriptGCList.stream().mapToDouble(GCResult::getRegionLength).toArray());
            return (int)Math.round(result);
        }

        public int getMedianGQuadruplexes() {
            return (int)Math.round(GatherGeneGCLength.getMedian(this.transcriptGCList.stream().mapToDouble(GCResult::getNumGQuadruplexesObserved).toArray()));
        }

        public int getNumTranscripts() {
            return this.transcriptGCList.size();
        }

        public Gene getGene() {
            return this.gene;
        }

        public String toString() {
            StringBuilder b = new StringBuilder();
            b.append(this.gene.toString());
            b.append(" %GC [" + GatherGeneGCLength.this.percentageFormat.format(this.getMedianGC()) + "]");
            b.append(" %G [" + GatherGeneGCLength.this.percentageFormat.format(this.getMedianG()) + "]");
            b.append(" %C [" + GatherGeneGCLength.this.percentageFormat.format(this.getMedianC()) + "]");
            b.append(" median GQuadruplex [" + this.getMedianGQuadruplexes() + "]");
            b.append(" Length [" + this.getMedianTranscriptLength() + "]");
            return b.toString();
        }
    }
}

