#!/usr/bin/perl use strict; use Data::Dumper; require XML::Simple; my %params = getParams(); opendir(DIR,"."); my @validated_files; my $status; while (my $file = readdir(DIR)) { if ($file =~ m/^chr.{1,2}\.probes\.xml\.\d+\.valid/) { if ($params{chr} eq "0") { $status = "Complete Genome"; push @validated_files, $file; } else { $status = "Chr".$params{chr}; push @validated_files, $file if ($file =~ m/chr$params{chr}\./); } } } print "Processing : $status\n"; print "Masking : "; if ($params{mask} == 0) { print "Off\n"; } else { print "On\n"; } @validated_files = sort @validated_files; my $last_chr = ""; my $current_chr = ""; my $probes; my @masked_seq; my $xml_file = new XML::Simple(); my $current_xml_read_time; for (my $i=0; $i<@validated_files; $i++) { ($current_chr) = ($validated_files[$i] =~ m/^chr(.+?)\./); if ($current_chr ne $last_chr) { #print "\nProcessing Chr$current_chr\n"; if ($last_chr ne "") { close VALID; close INVALID; # close REPROCESS; print "Masking Sequence\n"; maskSequence("chr$last_chr.axte",@masked_seq); @masked_seq = (); rename ("chr$current_chr.reprocess", "chr$current_chr.reprocess.orginal") if ( -z "chr$current_chr.reprocess" ); } $last_chr = $current_chr; my $chr_file = "chr$current_chr.probes.xml"; $probes = $xml_file->XMLin($chr_file, keyattr => {}); open(VALID,">>chr$current_chr.probes.xml.valid"); open(INVALID,">>chr$current_chr.probes.xml.invalid"); # open(REPROCESS,">>chr$current_chr.reprocess"); open(CHR,"chr$current_chr.axte"); } print "Processing --> $validated_files[$i]\n"; open(FILE, $validated_files[$i]); (my $batch_number) = ($validated_files[$i] =~ m/.+xml\.(\d+)\./); my $tmp_ctr = 0; my $mask_ctr = 0; my $avg = 0; my $probe_cnt=0; while() { my ($result, $start, $end) = ($_ =~ m/^(.+) .+,.+,(.+),(.+)$/); $tmp_ctr++; my $found = 0; my $i = $params{batchsize} * ($batch_number - 1) ; while (($found == 0) && ($i < @{$probes->{probe}})) { if (($probes->{probe}->[$i]->{species}->[0]->{alignment_start} <= $start) && ($probes->{probe}->[$i]->{species}->[0]->{alignment_end} >= $end)) { $found = 1; } else { $i++; } } $avg = $avg + $i; $probe_cnt++; if ($found == 1) { if ($result eq "VALID") { printf VALID " \n"; printf VALID " \n", $probes->{probe}->[$i]->{time}; printf VALID " %s\n",$probes->{probe}->[$i]->{date}; printf VALID " \n"; printf VALID " %d\n",$probes->{probe}->[$i]->{length}; printf VALID " %d\n",$probes->{probe}->[$i]->{alignment_identity}; printf VALID " %d\n",$probes->{probe}->[$i]->{probe_identity}; printf VALID " %d\n", $probes->{probe}->[$i]->{transversions}; printf VALID " %d\n",$probes->{probe}->[$i]->{transitions}; printf VALID " %d\n", $probes->{probe}->[$i]->{species}->[0]->{gaps}; printf VALID " %s\n", $probes->{probe}->[$i]->{probe_alignment}; printf VALID " \n"; printf VALID " %s\n",$probes->{probe}->[$i]->{species}->[0]->{name}; printf VALID " %s\n",$probes->{probe}->[$i]->{species}->[0]->{file}; printf VALID " %s\n",$probes->{probe}->[$i]->{species}->[0]->{version}; printf VALID " %d\n",$probes->{probe}->[$i]->{species}->[0]->{alignment_start}; printf VALID " %d\n",$probes->{probe}->[$i]->{species}->[0]->{alignment_end}; printf VALID " %s\n", $probes->{probe}->[$i]->{species}->[0]->{chromosome}; printf VALID " %s\n",$probes->{probe}->[$i]->{species}->[0]->{alignment_strand}; printf VALID " %d\n", $probes->{probe}->[$i]->{species}->[0]->{probe_start}; printf VALID " %d\n", $probes->{probe}->[$i]->{species}->[0]->{probe_end}; printf VALID " %s\n", $probes->{probe}->[$i]->{species}->[0]->{sequence}; printf VALID " \n"; printf VALID " \n"; printf VALID " %s\n",$probes->{probe}->[$i]->{species}->[1]->{name}; printf VALID " %s\n",$probes->{probe}->[$i]->{species}->[1]->{file}; printf VALID " %s\n",$probes->{probe}->[$i]->{species}->[1]->{version}; printf VALID " %d\n",$probes->{probe}->[$i]->{species}->[1]->{alignment_start}; printf VALID " %d\n", $probes->{probe}->[$i]->{species}->[1]->{alignment_end}; printf VALID " %s\n",$probes->{probe}->[$i]->{species}->[1]->{alignment_strand}; printf VALID " %s\n", $probes->{probe}->[$i]->{species}->[1]->{chromosome}; printf VALID " %d\n",$probes->{probe}->[$i]->{species}->[1]->{probe_start}; printf VALID " %d\n",$probes->{probe}->[$i]->{species}->[1]->{probe_end}; printf VALID " %s\n", $probes->{probe}->[$i]->{species}->[1]->{sequence}; printf VALID " \n"; printf VALID " \n\n\n"; #print " Writting probe information to valid file\n"; } else { #print " Writting probe information to invalid file\n"; #print " Mask set to [".$params{mask}."]\n"; if ($params{mask} == 1) { my $start_mask = $probes->{probe}->[$i]->{species}->[0]->{probe_start}; my $end_mask = $probes->{probe}->[$i]->{species}->[0]->{probe_end}; $mask_ctr++; #print "Mask ($mask_ctr) Adding to mask\n"; push @masked_seq, [$start_mask, $end_mask]; } # print REPROCESS $probes->{probe}->[$i]->{species}->[0]->{chromosome} . "," . $probes->{probe}->[$i]->{species}->[0]->{alignment_start}. "," . $probes->{probe}->[$i]->{species}->[0]->{alignment_end}. "\n"; printf INVALID " \n"; printf INVALID " \n", $probes->{probe}->[$i]->{time}; printf INVALID " %s\n",$probes->{probe}->[$i]->{date}; printf INVALID " \n"; printf INVALID " %d\n",$probes->{probe}->[$i]->{length}; printf INVALID " %d\n",$probes->{probe}->[$i]->{alignment_identity}; printf INVALID " %d\n",$probes->{probe}->[$i]->{probe_identity}; printf INVALID " %d\n", $probes->{probe}->[$i]->{transversions}; printf INVALID " %d\n",$probes->{probe}->[$i]->{transitions}; printf INVALID " %d\n", $probes->{probe}->[$i]->{species}->[0]->{gaps}; printf INVALID " %s\n", $probes->{probe}->[$i]->{probe_alignment}; printf INVALID " \n"; printf INVALID " %s\n",$probes->{probe}->[$i]->{species}->[0]->{name}; printf INVALID " %s\n",$probes->{probe}->[$i]->{species}->[0]->{file}; printf INVALID " %s\n",$probes->{probe}->[$i]->{species}->[0]->{version}; printf INVALID " %d\n",$probes->{probe}->[$i]->{species}->[0]->{alignment_start}; printf INVALID " %d\n",$probes->{probe}->[$i]->{species}->[0]->{alignment_end}; printf INVALID " %s\n", $probes->{probe}->[$i]->{species}->[0]->{chromosome}; printf INVALID " %s\n",$probes->{probe}->[$i]->{species}->[0]->{alignment_strand}; printf INVALID " %d\n", $probes->{probe}->[$i]->{species}->[0]->{probe_start}; printf INVALID " %d\n", $probes->{probe}->[$i]->{species}->[0]->{probe_end}; printf INVALID " %s\n", $probes->{probe}->[$i]->{species}->[0]->{sequence}; printf INVALID " \n"; printf INVALID " \n"; printf INVALID " %s\n",$probes->{probe}->[$i]->{species}->[1]->{name}; printf INVALID " %s\n",$probes->{probe}->[$i]->{species}->[1]->{file}; printf INVALID " %s\n",$probes->{probe}->[$i]->{species}->[1]->{version}; printf INVALID " %d\n",$probes->{probe}->[$i]->{species}->[1]->{alignment_start}; printf INVALID " %d\n", $probes->{probe}->[$i]->{species}->[1]->{alignment_end}; printf INVALID " %s\n",$probes->{probe}->[$i]->{species}->[1]->{alignment_strand}; printf INVALID " %s\n", $probes->{probe}->[$i]->{species}->[1]->{chromosome}; printf INVALID " %d\n",$probes->{probe}->[$i]->{species}->[1]->{probe_start}; printf INVALID " %d\n",$probes->{probe}->[$i]->{species}->[1]->{probe_end}; printf INVALID " %s\n", $probes->{probe}->[$i]->{species}->[1]->{sequence}; printf INVALID " \n"; printf INVALID " \n\n\n"; } } } close FILE; } maskSequence("chr$current_chr.axte",@masked_seq); ######################################################################### # # Mask Sequence - This subroutine # ######################################################################### sub maskSequence { return; my $file; my @seq; my $header; my $add_to_mini; my $mask_cnt = 0; $file= shift; foreach(@_) { push @seq, $_; } #for (my $i=0; $i<@seq; $i++) #{ print "Mask (".$seq[$i][0] ." - ". $seq[$i][1] . ")\n"; } open AXTE,$file; my $temp_file = "$file.temp"; my $mini_axte = "$file.mini"; open NEW_AXTE, ">$temp_file"; open MINI_AXTE, ">$mini_axte"; while () { # Ignore non headers next if ($_ =~ /^[^1]/); my ($start,$end) = ($_ =~ m/^1 .+? .+? .+? .+? (.+?) (.+?) /); print NEW_AXTE $_; $header = $_; $add_to_mini = 0; # Get Human Sequence $_ = ; chomp; my $masked_line = $_; for (my $i=0; $i<@seq; $i++) { if (($start <= $seq[$i][0]) && ($seq[$i][1] <= $end)) { $add_to_mini = 1; #print "MASK -> $masked_line\n";; $mask_cnt++; #print " $mask_cnt : Masked Line ($start - $end)\n"; # Determine MIN number of BP BEFORE the probe my $length = $seq[$i][0] - $start ; my $mask_length = $seq[$i][1] - $seq[$i][0] + 1; my $temp = substr $masked_line, 0, $length; my $gaps = ($temp =~ tr/-//); my $count = length($temp) - $gaps; while ($count != $length) { $temp = substr $masked_line, 0, ($length + $gaps); $gaps = ($temp =~ tr/-//); $count = length($temp) - $gaps; } my $leading_seq = $temp; my $probe_offset = length($leading_seq); $temp = substr($masked_line, $probe_offset, $mask_length); $gaps = ($temp =~ tr/-//); $count = length($temp) - $gaps; while ($count != $mask_length) { $temp = substr $masked_line, $probe_offset, ($mask_length + $gaps); $gaps = ($temp =~ tr/-//); $count = length($temp) - $gaps; } $temp =~ tr/ATGC/NNNN/; my $mask = $temp; my $trailing_offset = length($leading_seq) + length($mask); my $trailing_seq = substr $masked_line, $trailing_offset; $masked_line = $leading_seq . $mask . $trailing_seq; #last; } } print NEW_AXTE "$masked_line\n"; if ($add_to_mini == 1) { #print " Adding info to mini\n"; print MINI_AXTE $header; print MINI_AXTE "$masked_line\n"; } $_ = ; print NEW_AXTE "$_\n"; print MINI_AXTE "$_\n" if ($add_to_mini == 1); } close(NEW_AXTE); close(MINI_AXTE); #print " --> Chr Done, renaming $temp_file -> $file\n"; rename $temp_file, $file } ######################################################################### # getParams - gets all options from the command line or keeps defaults # dies if required options are not included # ######################################################################### sub getParams { my %opts; my $errstr = ""; use Getopt::Long; use Data::Dumper; GetOptions(\%opts, 'mask','help','chr:s','batchsize:i'); if ($opts{help}) { $errstr = " "; } if ($errstr) { print <<"END_USAGE"; Description: This script will take an XML probe file and based upon the results from megablast, will seperate the probes in to a XML.VALID or XML.INVALID Usage: separate_probes.pl [-mask] Probe Sources: -mask OPTIONAL - if set, the probes that are invalid will be masked out of the axte file. END_USAGE print "\n$errstr" if ($errstr); die("\n"); } return ( mask => $opts{mask} || 0, chr => $opts{chr} || "0", batchsize => $opts{batchsize} || 0 ); }