#!/usr/bin/perl use strict; use subs ; use Data::Dumper; # Updates # # 2004-01-07 RTS - for build 2, must remove 'chr' prefix # from 2nd field (subject identity) before comparing # chromosome numbers # my %opts = getParams(); open(MEGA, $opts{file}); my $header = ""; my $found = 0; my $non_unique = 0; my $cutoff1 = 0; my $cutoff2 = 0; my $valid = 1; my $actual_start; my $actual_end; my $subject_chr = ""; # field 1, chromosome number while() { next if ($_ =~ m/^#/); chomp; my @line = split(" ",$_); # for build 2, get digits only from field 2 (subject ID) e.g. 'chr22' ($subject_chr) = ($line[1] =~ /^chr(.+)/); # get digits only from field 2 (e.g. chr22) #### # for build 1, do not remove 'chr' prefix ####($subject_chr) = $line[1]; print "line[1]:$line[1] subject_chr:$subject_chr \n" if ($opts{debug} == 1); #rts - debug if ($line[0] ne $header) { if ($header ne "") { if (($valid == 1) && ($found == 1)) { print "VALID $header $actual_start $actual_end\n"; } elsif ($found == 0) { print "NOT_FOUND $header $actual_start $actual_end\n" } elsif ($cutoff1 > 0) { print "UNEXPECTED $header $actual_start $actual_end\n"; } else { print "TOO_MANY $header $actual_start $actual_end\n"; } print "$header valid [$valid] found [$found] 40+ [$cutoff1] 36+ [$cutoff2]\n\n\n\n\n" if ($opts{debug} == 1); } $header = $line[0]; $non_unique = 0; $found = 0; $cutoff1 = 0; $cutoff2 = 0; $valid = 1; } my @probe = split(",",$line[0]); print "$_\n" if ($opts{debug} == 1); print "probe[1]:$probe[1] line[1]:$line[1] \n" if ($opts{debug} == 1); #rts - debug # if perfect match (same chromomsome number, high quality, same start & stop) if ( ($probe[1] eq $subject_chr) && ($line[11] > 71) && ($probe[2] == $line[8]) && ($probe[3] == $line[9]) ) { $found = 1; $actual_start = $line[8]; $actual_end = $line[9]; } # end of perfect match else # else not perfect match { if ($line[11] > 40) { #print "Unexpected match\n" if ($opts{debug} == 1); $cutoff1++; } if ($line[11] > 36) { #print "Close Match\n" if ($opts{debug} == 1); $cutoff2++; } if ((($cutoff1 >= 1) || ($cutoff2 >= 5)) && ($valid == 1)) { #print "Not Unique\n" if ($opts{debug} == 1); $valid = -1; } } } # end while() if (($valid == 1) && ($found == 1)) { print "VALID $header\n"; } elsif ($found == 0) { print "NOT_FOUND $header\n" } elsif ($cutoff1 > 0) { print "UNEXPECTED $header\n"; } else { print "TOO_MANY $header\n"; } print "$header valid [$valid] found [$found] 40+ [$cutoff1] 36+ [$cutoff2]\n\n\n\n\n" if ($opts{debug} == 1); ######################################################################### # getParams - gets all options from the command line or keeps defaults # dies if required options are not included ######################################################################### sub getParams { my %opts; my $errstr = ""; use Getopt::Long; use Data::Dumper; GetOptions(\%opts, 'file=s','debug'); if (!defined($opts{file})) { $errstr .= "ERROR : No output file specified.\n"; } if ($errstr) { print <<"END_USAGE"; Description: This script will take the output from megablast and create an output file that will denote if the probe was either unique or non-unique. Usage: validate_megablast_output.pl [options] Probe Sources: -file MegaBlast Output file in -D3 output format END_USAGE print "\n$errstr" if ($errstr); die("\n"); } return ( file => $opts{file}, debug => $opts{debug} || 0 ); } ############################################################################## # Start Description # Evaluate a file of sorted megablast output to characterize each probe as unique or non-unique. # End Description ##############################################################################