#input to file = genbank protein full record
#output to file = output.txt containing all region sequences + 
#output to file = max, min length of region sequences + total number of region sequences

use strict;
use warnings;

# Declare and initialize variables
my $region_name = "Helical region"; #change this to search for other regions
my $totalrecord = 2159;		#change this for total number of protein records from genbank
my $library = '01_helical_homosapiens_2159.gpwithparts';	#change this for the name of inputfile containing protein record from genbank

my $fh; # variable to store filehandle
my $record;
my $dna;
my $annotation;
my %fields;
my @features;
my $offset;
my $counter = 0;

my $max = 0;		#to find maximum sequence length of structure region
my $min = 2000;
my $regioncounter = 0;


# Perform some standard subroutines for test
$fh = open_file($library);
$offset = tell($fh);

open(OutputFile, ">output.txt");   #initial output file  to store all sequences
open(OutputLengthFile, ">outputlength.txt"); #sort output by length
#print OutputFile "Here is the result of processing \n\n";

while($counter < $totalrecord)  {
	$record = get_next_record($fh, $offset);
	($annotation, $dna) = get_annotation_and_dna($record);
	%fields = parse_annotation($annotation);
	@features = parse_features($fields{'FEATURES'});
	
	foreach my $feature (@features) {
      my($featurename) = ($feature =~ /^ {5}(\S+)/);
      #print "$featurename \n";
      #print $feature;
	  #print OutputFile $feature;
	  
	  if ($featurename =~ /Region/) {
		#print "============$featurename===============\n";
		if ($feature =~ /\/region_name="$region_name"/) {
			(my $first, my $second) = ($feature =~ /^\s*Region\s*([0-9]+)..([0-9]+).*/);  
			# get the start and end helical region positions
			print "first= ".$first."\n";
			print "second= ".$second."\n";
			my $subSeq = substr($dna, $first-1, ($second-$first+1));  #get substring from $dna
			
			if ((length $subSeq) > $max ) {
				$max = (length $subSeq)
			}	#find maximum length of region sequences
			if ((length $subSeq) < $min ) {
				$min = (length $subSeq)
			}	#find minimum length of region sequences
			
			print "substring = ".$subSeq."\n\n";
			print OutputFile $subSeq."\n";
			
			$regioncounter = $regioncounter + 1;
		}
	  }
	}

	$offset = tell($fh);
	$counter = $counter + 1;
}

	
print "max length = $max \n";
print "min length = $min \n";
print "total region number = $regioncounter \n";
	
exit;

################################################################################
# Subroutines
################################################################################

sub open_file {
    my($filename) = @_;
    my $fh;

    unless(open($fh, $filename)) {
        print "Cannot open file $filename\n";
        exit;
    }
    return $fh;
}


sub get_next_record {
    my($fh, $offset) = @_;
	#print "offset = ".$offset."\n";
	seek($fh, $offset, 0);

    my($record) = '';
    my($save_input_separator) = $/;

    $/ = "//\n";

    $record = <$fh>;
    $/ = $save_input_separator;

    return $record;
}


sub get_annotation_and_dna {

    my($record) = @_;
	
    my($annotation) = '';
    my($dna) = '';

    # Now separate the annotation from the sequence data
    ($annotation, $dna) = ($record =~ /^.*(LOCUS.*ORIGIN\s*\n)(.*)\/\/\n/s);

    # clean the sequence of any whitespace or / characters 
    #  (the / has to be written \/ in the character class, because
    #   / is a metacharacter, so it must be "escaped" with \)
    $dna =~ s/[\s\/\d]//g;

    return($annotation, $dna)
}

sub parse_annotation {
    my($annotation) = @_; 
    my(%results) = (  );

    while( $annotation =~ /^[A-Z].*\n(^\s.*\n)*/gm ) {
        my $value = $&;
        (my $key = $value) =~ s/^([A-Z]+).*/$1/s;
        $results{$key} = $value;
    }

    return %results;
}

sub parse_features {
     my($features) = @_;   # entire FEATURES field in a scalar variable

     # Declare and initialize variables
     my(@features) = ();   # used to store the individual features

     # Extract the features
     while( $features =~ /^ {5}\S.*\n(^ {21}\S.*\n)*/gm ) {
         my $feature = $&;
     push(@features, $feature);
     }

     return @features;
}