/*
 * Decompiled with CFR 0.152.
 */
package org.broadinstitute.dropseqrna.annotation;

import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.reference.ReferenceSequence;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import htsjdk.samtools.reference.ReferenceSequenceFileFactory;
import htsjdk.samtools.util.StringUtil;
import java.io.File;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import org.broadinstitute.dropseqrna.annotation.GTFReader;
import org.broadinstitute.dropseqrna.annotation.GeneFromGTF;
import org.broadinstitute.dropseqrna.cmdline.MetaData;
import picard.PicardException;
import picard.annotation.Gene;
import picard.cmdline.CommandLineProgram;
import picard.cmdline.CommandLineProgramProperties;
import picard.cmdline.Option;

@CommandLineProgramProperties(usage="Validate reference fasta and GTF for use in Drop-Seq, and display sequences that appear in one but not the other, and display all gene_biotype values (transcript types)", usageShort="Validate reference fasta and GTF for use in Drop-Seq", programGroup=MetaData.class)
/**
 * Command-line program that cross-checks a reference FASTA against a GTF annotation file.
 *
 * <p>It prints, to standard output:
 * <ul>
 *   <li>transcripts that have no exons (at most the first 100 are listed),</li>
 *   <li>sequence names that occur in the FASTA but on no loaded gene,</li>
 *   <li>the sequences the {@link GTFReader} reports as unrecognized,</li>
 *   <li>every distinct transcript type seen on the loaded genes, and</li>
 *   <li>a few summary fractions.</li>
 * </ul>
 * Reference sequences containing a byte that is not an IUPAC nucleotide code produce a
 * warning on standard error.
 */
public class ValidateReference
extends CommandLineProgram {
    @Option(shortName="R", doc="The reference fasta")
    public File REFERENCE;
    @Option(doc="Gene annotation file in GTF format")
    public File GTF;

    /** Upper-case IUPAC nucleotide codes; lower-case variants are accepted as well. */
    private static final String IUPAC_CODES = "ACGTURYSWKMBDHVN";

    /** Lookup table indexed by the unsigned byte value; {@code true} marks an accepted base. */
    private static final boolean[] IUPAC_TABLE = new boolean[256];

    static {
        for (char code : IUPAC_CODES.toCharArray()) {
            IUPAC_TABLE[code] = true;
            IUPAC_TABLE[Character.toLowerCase(code)] = true;
        }
    }

    public static void main(String[] args) {
        new ValidateReference().instanceMainWithExit(args);
    }

    /**
     * Runs the validation and prints the report.
     *
     * @return always 0; problems found in the inputs are reported as text, not as an exit code
     */
    protected int doWork() {
        SAMSequenceDictionary sequenceDictionary = this.makeSequenceDictionary(this.REFERENCE);
        GTFReader gtfReader = new GTFReader(this.GTF, sequenceDictionary);

        // LinkedHashSet keeps the FASTA order so the report lists sequences as they appear in the file.
        Set<String> sequencesInReference = new LinkedHashSet<String>();
        for (SAMSequenceRecord record : sequenceDictionary.getSequences()) {
            sequencesInReference.add(record.getSequenceName());
        }

        Set<GeneFromGTF> geneAnnotations = gtfReader.load().getAll();
        Set<String> sequencesInGtf = new LinkedHashSet<String>();
        Set<String> transcriptTypes = new LinkedHashSet<String>();
        List<String> transcriptsWithNoExons = new ArrayList<String>();
        for (GeneFromGTF gene : geneAnnotations) {
            sequencesInGtf.add(gene.getContig());
            transcriptTypes.add(gene.getTranscriptType());
            for (Gene.Transcript transcript : gene) {
                if (transcript.exons.length == 0) {
                    transcriptsWithNoExons.add(String.format("Gene %s, Transcript %s on sequence %s has no exons", gene.getGeneID(), transcript.name, gene.getContig()));
                }
            }
        }

        if (!transcriptsWithNoExons.isEmpty()) {
            System.out.println(transcriptsWithNoExons.size() + "  transcript(s) have no exons");
            // Cap the listing so a badly broken GTF does not flood the console.
            int shown = Math.min(100, transcriptsWithNoExons.size());
            for (int i = 0; i < shown; ++i) {
                System.out.println(transcriptsWithNoExons.get(i));
            }
        }

        this.validateReferenceBases(this.REFERENCE);

        Set<String> onlyInReference = ValidateReference.subtract(sequencesInReference, sequencesInGtf);
        Set<String> onlyInGtf = gtfReader.getUnrecognizedSequences();
        System.out.println("\nSequences only in reference FASTA:");
        this.logCollection(onlyInReference);
        System.out.println("\nSequences only in GTF:");
        this.logCollection(onlyInGtf);
        System.out.println("\ngene_biotype values:");
        this.logCollection(transcriptTypes);

        double fractionOfSequencesOnlyInReference = (double)onlyInReference.size() / (double)sequencesInReference.size();
        long sizeOfOnlyInReference = 0L;
        for (String name : onlyInReference) {
            sizeOfOnlyInReference += (long)sequenceDictionary.getSequence(name).getSequenceLength();
        }
        double fractionOfGenomeOfSequencesOnlyInReference = (double)sizeOfOnlyInReference / (double)sequenceDictionary.getReferenceLength();
        // NOTE(review): the denominator counts only sequences of loaded genes; if the reader drops genes on
        // unrecognized sequences, those are not in sequencesInGtf and this ratio can exceed 1 -- confirm intent.
        double fractionOfSequencesOnlyInGtf = (double)onlyInGtf.size() / (double)sequencesInGtf.size();
        System.out.println("\nFraction of sequences only in reference FASTA: " + fractionOfSequencesOnlyInReference);
        System.out.println("\n(Sum of lengths of sequences only in reference FASTA)/(size of genome): " + fractionOfGenomeOfSequencesOnlyInReference);
        System.out.println("\nFraction of sequences only in GTF: " + fractionOfSequencesOnlyInGtf);
        return 0;
    }

    /**
     * Builds a sequence dictionary by reading every sequence of the FASTA.
     *
     * @param referenceFile the reference FASTA to scan
     * @return a dictionary with one record (name and length) per sequence, in file order
     * @throws PicardException if the same sequence name occurs more than once
     */
    private SAMSequenceDictionary makeSequenceDictionary(File referenceFile) {
        ReferenceSequenceFile refSeqFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(referenceFile, true);
        try {
            List<SAMSequenceRecord> records = new ArrayList<SAMSequenceRecord>();
            Set<String> sequenceNames = new HashSet<String>();
            ReferenceSequence refSeq;
            while ((refSeq = refSeqFile.nextSequence()) != null) {
                // Set.add returns false when the name was already present.
                if (!sequenceNames.add(refSeq.getName())) {
                    throw new PicardException("Sequence name appears more than once in reference: " + refSeq.getName());
                }
                records.add(new SAMSequenceRecord(refSeq.getName(), refSeq.length()));
            }
            return new SAMSequenceDictionary(records);
        } finally {
            ValidateReference.closeReference(refSeqFile, referenceFile);
        }
    }

    /**
     * Scans every reference sequence and warns (once per sequence, on standard error) about the
     * first byte that is not an IUPAC nucleotide code.
     *
     * @param referenceFile the reference FASTA to scan
     */
    private void validateReferenceBases(File referenceFile) {
        ReferenceSequenceFile refSeqFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(referenceFile, true);
        try {
            ReferenceSequence sequence;
            while ((sequence = refSeqFile.nextSequence()) != null) {
                for (byte base : sequence.getBases()) {
                    // Java bytes are signed: mask to 0..255 so values >= 0x80 index the table
                    // instead of throwing ArrayIndexOutOfBoundsException.
                    int unsignedBase = base & 0xFF;
                    if (IUPAC_TABLE[unsignedBase]) {
                        continue;
                    }
                    System.err.println(String.format("WARNING: AT least one invalid base '%c' (decimal %d) in reference sequence named %s", Character.valueOf(StringUtil.byteToChar(base)), unsignedBase, sequence.getName()));
                    // One warning per sequence is enough; move on to the next sequence.
                    break;
                }
            }
        } finally {
            ValidateReference.closeReference(refSeqFile, referenceFile);
        }
    }

    /**
     * Closes a reference reader. A failure to close is reported as a warning rather than thrown,
     * so that it cannot mask an exception (or a successful result) from the read itself.
     */
    private static void closeReference(ReferenceSequenceFile refSeqFile, File referenceFile) {
        try {
            refSeqFile.close();
        } catch (java.io.IOException e) {
            System.err.println("WARNING: Could not close reference file " + referenceFile + ": " + e.getMessage());
        }
    }

    /**
     * Returns a new insertion-ordered set holding the elements of {@code setToSubtractFrom} that
     * are not in {@code setToSubtract}. Neither argument is modified.
     */
    private static <T> Set<T> subtract(Set<T> setToSubtractFrom, Set<T> setToSubtract) {
        Set<T> ret = new LinkedHashSet<T>(setToSubtractFrom);
        ret.removeAll(setToSubtract);
        return ret;
    }

    /** Prints each element on its own line, or {@code (none)} when the collection is empty. */
    private void logCollection(Collection<String> collection) {
        if (collection.isEmpty()) {
            System.out.println("(none)");
            return;
        }
        for (String s : collection) {
            System.out.println(s);
        }
    }
}

