#! /usr/bin/perl

use warnings;

my $package_version = "2021-02-22";

use Getopt::Long;
Getopt::Long::Configure(qw(no_auto_abbrev no_ignore_case_always));

GetOptions(
    'R|exclude-readthrough' => \$exclude_readthrough_p,
    'E|exons' => \$exons_p,
    'C|cds' => \$cds_p,
    );

if (!defined($exclude_readthrough_p)) {
    $exclude_readthrough_p = 0;
}

if (!defined($exons_p)) {
    $exons_p = 1;
}

if (!defined($cds_p)) {
    $cds_p = 0;
}


$gene_name = "";
$last_transcript_id = "";
$last_cds_id = "";
$cds_transcript_id = "";
@exons = ();
@CDS_regions = ();
while (defined($line = <>)) {
    if ($line =~ /^\#/) {
	# Skip
    } elsif ($line !~ /\S/) {
	# Skip blank line
    } else {
	$line =~ s/\r\n/\n/;
	chop $line;
	@fields = split /\t/,$line;

	if ($fields[2] eq "gene") {
	    if ($exons_p == 1 && $#exons >= 0 &&
		($exclude_readthrough_p == 0 || $readthrough_p == 0)) {
		# Handle last mRNA of previous gene
		print_gene(\@exons,$gene_name,$last_transcript_id,$chr,$strand);
	    }
	    @exons = ();

	    if ($cds_p == 1 && $#CDS_regions >= 0 &&
		($exclude_readthrough_p == 0 || $readthrough_p == 0)) {
		# Handle last mRNA of previous gene
		print_gene(\@CDS_regions,$gene_name,$cds_transcript_id,$chr,$strand);
	    }
	    @CDS_regions = ();

	    ($gene_name) = $fields[8] =~ /gene_name=([^;]+)/;
	    if (!defined($gene_name)) {
		($gene_name) = $fields[8] =~ /ID=([^;]+)/;
	    }
	    $chr = $fields[0];
	    
	} elsif ($fields[2] eq "mRNA" || $fields[2] eq "transcript" || $fields[2] eq "ncRNA") {
	    if ($exons_p == 1 && $#exons >= 0 &&
		($exclude_readthrough_p == 0 || $readthrough_p == 0)) {
		print_gene(\@exons,$gene_name,$last_transcript_id,$chr,$strand);
	    }
	    @exons = ();

	    if ($cds_p == 1 && $#CDS_regions >= 0 &&
		($exclude_readthrough_p == 0 || $readthrough_p == 0)) {
		# Should this be $last_transcript_id or $cds_transcript_id?
		print_gene(\@CDS_regions,$gene_name,$cds_transcript_id,$chr,$strand);
	    }
	    @CDS_regions = ();

	    ($last_transcript_id) = $fields[8] =~ /ID=([^;]+)/;
	    $strand = $fields[6];
	    
	    # For GFF3 files without a gene line
	    if (!defined($gene_name) || $gene_name !~ /\S/) {
		$gene_name = $last_transcript_id;
	    }
	    if (!defined($chr) || $chr !~ /\S/) {
		$chr = $fields[0];
	    }

	    if (($readthrough_p = has_tag_p(\@fields,"readthrough_transcript")) == 1) {
		print STDERR "Excluding $last_transcript_id in gene $gene_name because it is a readthrough_transcript\n";
	    }

	} elsif ($fields[2] eq "exon") {
	    ($parent) = $fields[8] =~ /Parent=([^;]+)/;
	    if (!defined($parent) || $parent eq $last_transcript_id) {
		push @exons,"$fields[3] $fields[4]";
	    }

	} elsif ($fields[2] eq "CDS") {
	    ($cds_id) = $fields[8] =~ /ID=([^;]+)/;
	    if (!defined($cds_id)) {
		push @CDS_regions,"$fields[3] $fields[4]";
	    } else {		
		if ($cds_id ne $last_cds_id) {
		    if ($cds_p == 1 && $#CDS_regions > 0 &&
			($exclude_readthrough_p == 0 || $readthrough_p == 0)) {
			print_gene(\@CDS_regions,$gene_name,$cds_transcript_id,$chr,$strand);
		    }
		    @CDS_regions = (); # Need to clear single CDS regions
		}
		push @CDS_regions,"$fields[3] $fields[4]";
		$last_cds_id = $cds_id;
	    }
	    ($cds_transcript_id) = $fields[8] =~ /Parent=([^;]+)/;
	    if (!defined($cds_transcript_id)) {
		$cds_transcript_id = $last_transcript_id;
	    }
	}
    }
}

if ($exons_p == 1 && $#exons >= 0 &&
    ($exclude_readthrough_p == 0 || $readthrough_p == 0)) {
    print_gene(\@exons,$gene_name,$last_transcript_id,$chr,$strand);
}
# @exons = ();

if ($cds_p == 1 && $#CDS_regions >= 0 &&
    ($exclude_readthrough_p == 0 || $readthrough_p == 0)) {
    # Should this be $last_transcript_id or $cds_transcript_id?
    print_gene(\@CDS_regions,$gene_name,$cds_transcript_id,$chr,$strand);
}
# @CDS_regions = ();

exit;


sub has_tag_p {
    my ($fields, $desired_tag) = @_;
    my ($tags, $tag);

    ($tags) = $ {$fields}[8] =~ /tag=([^;]+)/;
    if (!defined($tags)) {
	return 0;
    } else {
	foreach $tag (split ",",$tags) {
	    if ($tag eq $desired_tag) {
		return 1;
	    }
	}
	return 0;
    }
}


sub ascending_cmp {
    ($starta) = $a =~ /(\d+) \d+/;
    ($startb) = $b =~ /(\d+) \d+/;
    return $starta <=> $startb;
}

sub get_gene_bounds_plus {
    my ($exons) = @_;
    my ($start,$end);
    my @sorted;

    @sorted = sort ascending_cmp (@ {$exons});
    ($start) = $sorted[0] =~ /(\d+) \d+/;
    ($end) = $sorted[$#sorted] =~ /\d+ (\d+)/;
    return ($start,$end);
}

sub get_gene_bounds_minus {
    my ($exons) = @_;
    my ($start,$end);
    my @sorted;

    @sorted = reverse sort ascending_cmp (@ {$exons});
    ($end) = $sorted[0] =~ /\d+ (\d+)/;
    ($start) = $sorted[$#sorted] =~ /(\d+) \d+/;
    return ($start,$end);
}


sub get_exon_bounds_plus {
    my ($exons) = @_;
    my @starts = ();
    my @ends = ();

    foreach $exon (sort ascending_cmp (@ {$exons})) {
	($start,$end) = $exon =~ /(\d+) (\d+)/;
	push @starts,$start;
	push @ends,$end;
    }

    return (\@starts,\@ends);
}

sub get_exon_bounds_minus {
    my ($exons) = @_;
    my @starts = ();
    my @ends = ();

    foreach $exon (reverse sort ascending_cmp (@ {$exons})) {
	($start,$end) = $exon =~ /(\d+) (\d+)/;
	push @starts,$start;
	push @ends,$end;
    }

    return (\@starts,\@ends);
}


# $regions can be exons or CDS regions
sub print_gene {
    my ($regions, $gene_name, $transcript_id, $chr, $strand) = @_;
    my $nregions;
    my ($starts, $ends);

    $transcript_id =~ s/transcript://;
    $gene_name =~ s/gene://;

    if ($strand eq "+") {
	($start,$end) = get_gene_bounds_plus(\@exons);
	printf ">$transcript_id $chr:%u..%u\n",$start,$end;
    } elsif ($strand eq "-") {
	($start,$end) = get_gene_bounds_minus(\@exons);
	printf ">$transcript_id $chr:%u..%u\n",$end,$start;
    } else {
	die "strand $strand";
    }

    print "$gene_name\n";

    $nregions = $#{$regions} + 1;
    if ($strand eq "+") {
	($starts,$ends) = get_exon_bounds_plus($regions);
	for ($i = 0; $i < $nregions; $i++) {
	    printf "%u %u\n",$ {$starts}[$i],$ {$ends}[$i];
	}
    } elsif ($strand eq "-") {
	($starts,$ends) = get_exon_bounds_minus($regions);
	for ($i = 0; $i < $nregions; $i++) {
	    printf "%u %u\n",$ {$ends}[$i],$ {$starts}[$i];
	}
    }
    
    return;
}

