#!/usr/bin/perl use strict; use subs; use POSIX; use Data::Dumper; use Bio::SeqIO; require XML::Simple; my %args = getParams(); my @results; my $target; my @target_species; my @probe_files; ####################################################### # # BEGIN MAIN # ####################################################### # Set parameters that control usage @target_species = getSpecies($args{species}); @probe_files = getProbeFiles($args{file}); ($target) = ($probe_files[0] =~ m/human.+?_(T.)\./); # Display script settings. if ($args{verbose}) { print "\nProcessing Species\n"; for (my $i=0; $i<@target_species; $i++) { print " $target_species[$i]\n"; } print "\n"; print "Probe Files\n"; for (my $i=0; $i<@probe_files; $i++) { print " $probe_files[$i]\n"; } print "\n"; print "Target [$target]\n\n"; } # Add all probes in the XML file to the the results array for (my $i=0; $i<@probe_files; $i++) { my $xs = new XML::Simple(); (my $temp_species) = ($probe_files[$i] =~ m/^human_(.*?)_T/); print "Loading[".$probe_files[$i]."] -> [$temp_species]..."; my $probe_analysis= $xs->XMLin($probe_files[$i], keyattr => {}); print "Loaded\n"; # Add all the probes to an array if ($args{xml} == 1) { for (my $i=0; $i<@{$probe_analysis->{parent_alignment}}; $i++) { my $sequence = $probe_analysis->{parent_alignment}->[$i]->{probe}->{sequence}; my $start = $probe_analysis->{parent_alignment}->[$i]->{species}->[0]->{start}; my $probe_identity = $probe_analysis->{parent_alignment}->[$i]->{probe}->{probe_identity}; $start = $start + $probe_analysis->{parent_alignment}->[$i]->{probe}->{offset}; my $end = $start + length($sequence) - 1; push @results , [$start,$end,$temp_species,$sequence, $probe_identity]; print "Start [$start] End [$end] Species [$temp_species] Seq [$sequence] Length (".length($sequence).")\n" if ($args{verbose}); } } else { for (my $i=0; $i<@{$probe_analysis->{probe}}; $i++) { my $start = $probe_analysis->{probe}->[$i]->{species}->[0]->{probe_start}; my $end = $probe_analysis->{probe}->[$i]->{species}->[0]->{probe_end}; my $sequence = $probe_analysis->{probe}->[$i]->{species}->[0]->{sequence}; my $probe_identity = $probe_analysis->{probe}->[$i]->{probe_identity}; push @results , [$start,$end,$temp_species,$sequence, $probe_identity]; print "Start [$start] End [$end] Species [$temp_species] Seq [$sequence]\n" if ($args{verbose}); } } } print "\n\n###############################################\n" if ($args{verbose}); print " All probes loaded\n" if ($args{verbose}); print "###############################################\n\n\n" if ($args{verbose}); # Sort Probes by starting base pair. @results = sort { $a->[0] <=> $b->[0] } @results; # Hybridize the sequences against the other target species print "Number of target species [".@target_species."]\n\n" if ($args{verbose}); for (my $species_idx=0; $species_idx<@target_species; $species_idx++) { # Open up idx file to get offset of non-gapped alignment my $blastz_idx_file="human_" . $target_species[$species_idx] . "_" . $target . ".blastz.sc.idx"; my $fasta_file=$target_species[$species_idx] . "_" . $target . ".fasta"; my $species_name= $target_species[$species_idx]; my @idx_info; # Temp variable to hold 1 line from blastz.idx file my $t1 = 0; my $t2 = 0; open(IDX,$blastz_idx_file); print "\n########################################\n" if ($args{verbose}); print " Hybridization with [$species_name]\n" if ($args{verbose}); print "########################################\n\n" if ($args{verbose}); for (my $i=0; $i<@results; $i++) { my $description_fp = 0; # File pointer to description of alignment my $alignment_fp = 0; # File pointer to closest alignment my $probe_start_bp = $results[$i][0]; # location in human seq of start of probe my $probe_end_bp = $results[$i][1]; # Location in human seq of end of probe my $bp_found = 0; # number of start and stop locations found my $current_line_idx = 0; print "Search for alignment that contains the probe : [$probe_start_bp] - [$probe_end_bp]\n" if ($args{verbose}); while () { $current_line_idx = 0 - length($_); chomp; (@idx_info) = split /,/,$_; my $index_start_bp = $idx_info[0]; # Location in human seq of a local alignment my $index_end_bp = $idx_info[1]; # Location in human seq of the end of alignemt if (($index_start_bp <= $probe_start_bp) && ($probe_start_bp <= $index_end_bp)) { if ($probe_end_bp <= $index_end_bp) # Make sure the probe doesn't strech out of alignment { $description_fp = $idx_info[2]; # File pointer in blastz file of description print "Description fp set to [$description_fp]\n" if ($args{verbose}); $alignment_fp = $idx_info[3]; print "Found alignment [$index_start_bp] - [$index_end_bp] (fp : $alignment_fp)\n" if ($args{verbose}); #loop throught remaining non-gapped alignments to find closest filepointer. for (my $j=4; $j<@idx_info; $j=$j+2) { if ($idx_info[$j] > $probe_start_bp) { print "non-gapped start (".$idx_info[$j].") is past probe start, use last non-gapped (fp : $alignment_fp)\n" if ($args{verbose}); last; } print "Closest non-gapped start [".$idx_info[$j]."] (fp : ".$idx_info[$j+1].")\n" if ($args{verbose}); $alignment_fp = $idx_info[$j+1]; } #Found file pointer info from index file, break loop last; } } last if ($probe_start_bp < $index_start_bp); } if ($description_fp > 0) { $t1++; } else { $t2++; } seek IDX,$current_line_idx,1; print "File Pointers used : Desc [$description_fp] Alignment [$alignment_fp]\n" if ($args{verbose}); push @{$results[$i]}, $species_name; push @{$results[$i]}, $description_fp; push @{$results[$i]}, $alignment_fp; } close IDX; print " END SEARCHING INDEX FILE\n\n" if ($args{verbose}); my $blastz_file = "human_" . $target_species[$species_idx] . "_" . $target . ".blastz.sc"; open (BLASTZ,$blastz_file); my $temp1 = 0; my $temp2 = 0; for (my $i=0; $i<@results; $i++) { my $start_bp = $results[$i][0]; my $end_bp = $results[$i][1]; my $start_idx = $#{$results[$i]} - 4; my $desc_offset = $results[$i][$#{$results[$i]} -1]; my $alignment_offset = $results[$i][$#{$results[$i]}]; my $sequence = $results[$i][3]; my $description = "NONE"; my $species_start_bp = 0; my $species_end_bp = 0; my $alignment_mask = ""; my $reverse_complement = 0; if ($desc_offset > 0) { print "Probe Start [$start_bp] End [$end_bp]\n" if ($args{verbose}); seek BLASTZ,$desc_offset,0; $_ = ; chomp; # Skipping the AC0000000 number in the comment line, since Bioperl doesn't pick it up by default. ($description) = ($_ =~ m/">.*? (.+)"/); print "Desc [$description]\n" if ($args{verbose}); if (index($description, "reverse complement") != -1) { ($description) = ($description =~ m/(.*) \(reverse complement/); $reverse_complement = 1; print "Reverse Compliment \n" if ($args{verbose}); } seek BLASTZ, $alignment_offset, 0; while () { last if ($_ =~ m/^a {/); next if !($_ =~ m/^ l /); $_ =~ s/^\s+//; $_ =~ s/\s+$//; my @blast_line = split / /,($_); my $blast_human_start = $blast_line[1]; my $blast_human_end = $blast_line[3]; my $blast_species_start = $blast_line[2]; my $blast_species_end = $blast_line[4]; if ((($blast_human_start <= $start_bp) && ($start_bp <= $blast_human_end)) || (($start_bp < $blast_human_start) && ($end_bp >= $blast_human_start))) { print $_."\n"; my $offset = $start_bp - $blast_human_start; print "Offset [$offset]\n"; $species_end_bp = 0; #$species_start_bp = $blast_species_start + $offset; my $temp_sequence=""; if ($offset < 0) { print "Adding Human Insertions\n"; for (my $i=$offset; $i<0; $i++) { $temp_sequence .= "^"; } $offset = 0; $start_bp = $blast_human_start; $species_start_bp = $blast_species_start; } else { $species_start_bp = $blast_species_start + $offset; } my $length = 0; if ($end_bp < $blast_human_end) { print "Fill remaining with |'s\n"; my $num_leading_insertions = length($temp_sequence); for (my $i=0; $i $blast_species_end) { $_ = ; last if !($_ =~ m/^ l/); $_ =~ s/^\s+//; $_ =~ s/\s+$//; my $end_last_nongapped_human = $blast_human_end; my $end_last_nongapped_species = $blast_species_end; @blast_line = split / /,($_); $blast_human_start = $blast_line[1]; $blast_human_end = $blast_line[3]; $blast_species_start = $blast_line[2]; $blast_species_end = $blast_line[4]; # Temp_sequence will hold characters that represent the location of sequence. # | = base pair exists in both human and species sequence # ^ = Human Insertion: base pair exists in human only, and is a gap in the species # v = Species Insertion: base pair exists in species only, and is a gap in the human # Determine the number of human inserts my $gap_distance; if ($blast_human_start < $end_bp) { $gap_distance = $blast_human_start - $end_last_nongapped_human - 1; } else { $gap_distance = $end_bp - $end_last_nongapped_human - 1; } for (my $i=0; $i<$gap_distance; $i++) { $temp_sequence .= "^"; } if ($blast_human_start < $end_bp) { $gap_distance = $blast_species_start - $end_last_nongapped_species - 1; for (my $i=0; $i<$gap_distance; $i++) { $temp_sequence .= "v"; } } if ($end_bp < $blast_human_end) { my $nongapped_distance = $end_bp - $blast_human_start + 1; for (my $i=0; $i<$nongapped_distance; $i++) { $temp_sequence .= "|"; } $length = $length + $nongapped_distance; last; } else { my $nongapped_distance = $blast_human_end - $blast_human_start + 1; for (my $i=0; $i<$nongapped_distance; $i++) { $temp_sequence .= "|"; } $length = $length + ($blast_human_end - $blast_human_start); } } } (my $tmp_ctr) = ($temp_sequence =~ tr/|v//); print "Mask [$temp_sequence] Mouse Size [$tmp_ctr]\n"; $species_end_bp = $species_start_bp + ($temp_sequence =~ tr/|v//) - 1; print "Species_start_bp [$species_start_bp] species_end_bp [$species_end_bp]\n"; $alignment_mask = $temp_sequence; } } # While BLASTZ } # If desc_offset if (($species_start_bp == 0) && ($species_end_bp == 0)) { $description = "NONE"; $temp1++ } else { $temp2++ } push @{$results[$i]}, $description; push @{$results[$i]}, $reverse_complement; push @{$results[$i]}, $species_start_bp; push @{$results[$i]}, $species_end_bp; push @{$results[$i]}, $alignment_mask; push @{$results[$i]}, ""; # Place holder for species1 aligned sequence push @{$results[$i]}, ""; # Place holder for species2 aligned sequence print "RESULTS SAVED:\n" if ($args{verbose}); print "==============\n" if ($args{verbose}); print "Desc [$description]\n" if ($args{verbose}); print "Rev Comp [$reverse_complement]\n" if ($args{verbose}); print "Target Start [$species_start_bp]\n" if ($args{verbose}); print "Target End [$species_end_bp]\n" if ($args{verbose}); print "Mask [$alignment_mask]\n" if ($args{verbose}); print "\n\n" if ($args{verbose}); } # Order by Description, for easier lookup. Not sorting properly, aaaab,aaaa,aaaa,aaaa --> aaaa,aaaab,aaaa,aaaa #@results = sort { $a->[7] ne $b->[7] } @results; my $in = new Bio::SeqIO ( -format => 'fasta', -file => $fasta_file); while (my $seq = $in->next_seq) { for (my $i=0; $i<@results; $i++) { my $species2_dna = ""; if ($seq->desc eq $results[$i][$#{$results[$i]} - 6]) { my $temp_desc = $results[$i][$#{$results[$i]} - 6]; print "FASTA [".$seq->desc."]\n" if ($args{verbose}); print "Searching [$temp_desc]\n" if ($args{verbose}); my $reverse_complement_idx = $#{$results[$i]} - 5; my $start_idx = $#{$results[$i]} - 4; my $end_idx = $#{$results[$i]} - 3; my $mask_idx = $#{$results[$i]} - 2; my $species1_align_idx = $#{$results[$i]} - 1; my $species2_align_idx = $#{$results[$i]}; my $reverse_complement = $results[$i][$reverse_complement_idx]; my $start; my $end; if ($reverse_complement == 0) { $start = $results[$i][$start_idx]; $end = $results[$i][$end_idx]; } else { print "start [".$results[$i][$start_idx]."]\n"; print "end [".$results[$i][$end_idx]."]\n"; $end= $seq->length - $results[$i][$start_idx] + 1; $start = $seq->length - $results[$i][$end_idx] + 1; print "Start [$start] End [$end]\n"; } my $mask = $results[$i][$mask_idx]; my $species1_dna = $results[$i][3]; if (($start == 0) || ($seq->length < $end)) { #print "-------------------- ERROR START ---------------------\n"; #print "Skipping Probe:\n"; #print "Start [$start]\n"; #print "Fasta Description [". $seq->desc . "]\n"; #print "Fasta Length [" .$seq->length."]\n"; #print Dumper(\$results[$i]); #print "--------------------- ERROR END ----------------------\n"; #print "\n\n"; # Push empty sequence for human & species } else { print "Final Species Start[$start] End [$end] Length[".$seq->length."]\n"; $species2_dna = $seq->subseq($start,$end); print "S1 DNA [$species1_dna]\n" if ($args{verbose}); print "S2 DNA [$species2_dna] Initial\n" if ($args{verbose}); $species2_dna = reverse_complement($species2_dna) if ($reverse_complement == 1); print "S2 DNA [$species2_dna] Final\n" if ($args{verbose}); my $species1_align=""; my $species2_align=""; my $sp1_idx=0; my $sp2_idx=0; for (my $x=0; $x < length($mask); $x++) { my $bp_mask = substr($mask,$x,1); if ($bp_mask eq "^") { $species1_align .= substr($species1_dna, $sp1_idx,1); $sp1_idx++; $species2_align .= " "; } elsif ($bp_mask eq "v") { $species2_align .= substr($species2_dna, $sp2_idx,1); $sp2_idx++; $species1_align .= " "; } else { $species1_align .= substr($species1_dna, $sp1_idx,1); $sp1_idx++; $species2_align .= substr($species2_dna, $sp2_idx,1); $sp2_idx++; } } $results[$i][$species1_align_idx] = $species1_align; $results[$i][$species2_align_idx] = $species2_align; print "\n" if ($args{verbose}); } } } } print "\n\n" if ($args{verbose}); } # Print out the results in XML Format print "Opening [".$args{outfile}."] for xml output\n" if ($args{verbose}); open(OUTFILE,">".$args{outfile}); print OUTFILE "\n"; print OUTFILE "\n"; for (my $i=0; $i<@results; $i++) { print OUTFILE " \n"; print OUTFILE " ".$results[$i][2]."\n"; print OUTFILE " ".$results[$i][3]."\n"; print OUTFILE " ".$results[$i][0]."\n"; print OUTFILE " ".$results[$i][1]."\n"; print OUTFILE " ".$results[$i][4]."\n"; for (my $j=5; $j<@{$results[$i]}; $j=$j+10) { print OUTFILE " \n"; print OUTFILE " ".$results[$i][$j]."\n"; print OUTFILE " ".$results[$i][$j+5]."\n"; print OUTFILE " ".$results[$i][$j+6]."\n"; print OUTFILE " ".$results[$i][$j+8]."\n"; my $mask = standardize_mask($results[$i][$j+7],$results[$i][$j+8],$results[$i][$j+9]); print OUTFILE " $mask\n"; print OUTFILE " ".$results[$i][$j+9]."\n"; print OUTFILE " \n"; } print OUTFILE " \n"; } print OUTFILE "\n"; close OUTFILE; ################################################################## # # END OF MAIN CODE # ################################################################## ####################################### # ####################################### sub standardize_mask { my ($mask, $seq1, $seq2) = (@_); my $tmp_mask = ""; for (my $i=0; $i -species [-outfile ] [-verbose] [-xml ] This script will take a probe file and generate a computation hybridization with the specified species. The output from this script will be an xml formated result. For this script to function, the following files need to exist: ??????.fasta (for every species in -species) human.masked (human fasta file, must be ??????.masked) human_?????_T?.blastz.sc (for every species in -species) human_?????_T?.blastz.sc.idx (for every species in -species) human_?????_T?.verbose.probes.xml (for the source species, not for -species) Probe Sources: -species Name of species to hybridize or all for every species -file file containing probes or ALL for every probe file (format : human_????_T?.verbose.probes.xml) -verbose Prints status messages as script executes -outfile Name of output file, defaults to hybridization.xml -xml This specifies the xml version being read (either 1 or 2) END_USAGE print "\n$errstr" if ($errstr); die("\n"); } (my $source_species) = ($opts{file} =~ m/^.+?_(.+?)_/); return (species => $opts{species}, file => $opts{file}, outfile => $opts{outfile} || "hybridization.xml", verbose => $opts{verbose} || 0, xml => $opts{xml} || 2, source_species => $source_species); } ############################################################# # getSpecies - Determine which species will be processed ############################################################# sub getSpecies { # Get all the target species to hybridiza against my @target_species; my $species_arg = shift; if ($species_arg eq "all") { opendir(TARGET,"."); foreach my $file ( grep /$._T..fasta/, readdir(TARGET)) { ($target) = ($file =~ m/^.+?_(T.)/); (my $fasta_species) = ($file =~ m/^(.+?)_T/); print "[$target] [$fasta_species]\n"; if (( -f "human_".$fasta_species."_".$target.".blastz.sc") && ( -f "human_".$fasta_species."_".$target.".blastz.sc.idx")) { push @target_species, $fasta_species; } } closedir(TARGET); } else { push @target_species, $args{species}; } return @target_species; } ################################################## # reverse_compliment - make the reverse compliment # from a sequence ################################################## sub reverse_complement { my $dna = shift; $dna = reverse($dna); $dna =~ tr/ATGCatgc/TACGtacg/; return $dna; } ############################################## # getProbeFiles - Gets the list of files that # contain probes. ############################################## sub getProbeFiles { # Get all the probes from all the species files. my $file_arg = shift; my @probe_files; if ($file_arg eq "all") { opendir(TARGET,"."); foreach my $file ( grep /^human_.+.probes.xml$/, readdir(TARGET)) { push @probe_files, $file; } closedir(TARGET); } else { push @probe_files, $args{file}; } return @probe_files; }