#input to file = genbank protein full record #output to file = output.txt containing all region sequences + #output to file = max, min length of region sequences + total number of region sequences use strict; use warnings; # Declare and initialize variables my $region_name = "Helical region"; #change this to search for other regions my $totalrecord = 2159; #change this for total number of protein records from genbank my $library = '01_helical_homosapiens_2159.gpwithparts'; #change this for the name of inputfile containing protein record from genbank my $fh; # variable to store filehandle my $record; my $dna; my $annotation; my %fields; my @features; my $offset; my $counter = 0; my $max = 0; #to find maximum sequence length of structure region my $min = 2000; my $regioncounter = 0; # Perform some standard subroutines for test $fh = open_file($library); $offset = tell($fh); open(OutputFile, ">output.txt"); #initial output file to store all sequences open(OutputLengthFile, ">outputlength.txt"); #sort output by length #print OutputFile "Here is the result of processing \n\n"; while($counter < $totalrecord) { $record = get_next_record($fh, $offset); ($annotation, $dna) = get_annotation_and_dna($record); %fields = parse_annotation($annotation); @features = parse_features($fields{'FEATURES'}); foreach my $feature (@features) { my($featurename) = ($feature =~ /^ {5}(\S+)/); #print "$featurename \n"; #print $feature; #print OutputFile $feature; if ($featurename =~ /Region/) { #print "============$featurename===============\n"; if ($feature =~ /\/region_name="$region_name"/) { (my $first, my $second) = ($feature =~ /^\s*Region\s*([0-9]+)..([0-9]+).*/); # get the start and end helical region positions print "first= ".$first."\n"; print "second= ".$second."\n"; my $subSeq = substr($dna, $first-1, ($second-$first+1)); #get substring from $dna if ((length $subSeq) > $max ) { $max = (length $subSeq) } #find maximum length of region sequences if ((length $subSeq) < $min ) { $min = (length $subSeq) } #find minimum length of region sequences print "substring = ".$subSeq."\n\n"; print OutputFile $subSeq."\n"; $regioncounter = $regioncounter + 1; } } } $offset = tell($fh); $counter = $counter + 1; } print "max length = $max \n"; print "min length = $min \n"; print "total region number = $regioncounter \n"; exit; ################################################################################ # Subroutines ################################################################################ sub open_file { my($filename) = @_; my $fh; unless(open($fh, $filename)) { print "Cannot open file $filename\n"; exit; } return $fh; } sub get_next_record { my($fh, $offset) = @_; #print "offset = ".$offset."\n"; seek($fh, $offset, 0); my($record) = ''; my($save_input_separator) = $/; $/ = "//\n"; $record = <$fh>; $/ = $save_input_separator; return $record; } sub get_annotation_and_dna { my($record) = @_; my($annotation) = ''; my($dna) = ''; # Now separate the annotation from the sequence data ($annotation, $dna) = ($record =~ /^.*(LOCUS.*ORIGIN\s*\n)(.*)\/\/\n/s); # clean the sequence of any whitespace or / characters # (the / has to be written \/ in the character class, because # / is a metacharacter, so it must be "escaped" with \) $dna =~ s/[\s\/\d]//g; return($annotation, $dna) } sub parse_annotation { my($annotation) = @_; my(%results) = ( ); while( $annotation =~ /^[A-Z].*\n(^\s.*\n)*/gm ) { my $value = $&; (my $key = $value) =~ s/^([A-Z]+).*/$1/s; $results{$key} = $value; } return %results; } sub parse_features { my($features) = @_; # entire FEATURES field in a scalar variable # Declare and initialize variables my(@features) = (); # used to store the individual features # Extract the features while( $features =~ /^ {5}\S.*\n(^ {21}\S.*\n)*/gm ) { my $feature = $&; push(@features, $feature); } return @features; }