#!/usr/bin/perl # overgo picking and desgin workbench # written by Brian Carlson 2003 # # run soop -h or perldoc soop for full documentation ######################################################################### ##################### Required Modules ###################### ######################################################################### require 5.6.0; use warnings; use strict; use Data::Dumper; ######################################################################### # # FUNDAMENTAL DATA STRUCTURE # ######################################################################### # # # Generate the stucture that holds all the potential # pips that can create a probe, fill in with data. # # Pips is an array of pip_structs # # pip_struct is a hash of : # { # score -> # Multi-Species Score # probe -> array of probe structs # Actual Probe Sequence # species -> array of # species_info structs # } # # species_info is a hash of : # { # species -> # Species Name # start -> # Start Base Pair # stop -> # Stop Base Pair # sequence -> # Alignment Sequence # reservse -> "-/+" # Reverse Strand # build -> build # } # # probe_struct is a hash of : # { # # } # ######################################################################### ######################################################################### ################### Begin Globals ###################### ######################################################################### # This structure holds that distance between human and a given species # (in Millions of years) my %evolutionary_distance = ( 'human' => 0, 'mouse' => 5, 'rat' => 6, 'monkey' => 2, 'fugu' => 10 ); my @scoring_matrix; my @pips; my %args = getParams(); ######################################################################### ################### End Globals ###################### ######################################################################### ######################################################################### ################### Begin Main ###################### ######################################################################### my ($date, $time) = getExecutionTime(); debug_print("Probe generation started at $time on $date\n"); @scoring_matrix = initializeScoringMatrix($args{scoring_file}); @pips = getPipsFromFile($args{file}); @pips = restrictPipsToRange($args{min}, $args{max}, $args{range_file}, $args{species}, @pips); @pips = selectProbes($args{cutoff}, $args{probe_length}, $args{gc}, $args{tolerance}, @pips); printXML(\%args, \@pips); ######################################################################### ################### End Main ###################### ######################################################################### ######################################################################### ################### Custom Functions ###################### ######################################################################### sub printXML { my $local_args = shift; my $local_pips = shift; my $run_path; my $date; my $time; my $script_name = $0; chop ($run_path = `pwd`); ($date, $time) = getExecutionTime(); # Print out script execution settings if ($local_args->{probefile} ne "") { open (OUT, ">".$local_args->{probefile}); select OUT; } print "\n"; print "\n"; print "\n\n\n"; print "\n\n" ; # Loop through pips, and print out XML for probe, if it's reaches cutoff for (my $i=0; $i<@{$local_pips}; $i++) { if(defined($local_pips->[$i]->{probes})) { for (my $probe_idx=0; $probe_idx<@{$local_pips->[$i]->{probes}}; $probe_idx++) { print " \n"; printf " \n", $time; printf " %s\n", $date; printf " \n"; printf " %d\n", $local_args->{probe_length}; printf " %5.2f\n", $local_pips->[$i]->{probes}->[$probe_idx]->{score}; printf " \n"; printf " %s\n", $local_pips->[$i]->{target_species}; printf " %s\n", $local_args->{file}; printf " %s\n", $local_pips->[$i]->{build}; #printf " %d\n", $local_pips->[$i]->{start}; #printf " %d\n", $local_pips->[$i]->{end}; printf " %s\n", $local_pips->[$i]->{chromosome}; #printf " %s\n",$local_pips->[$i]->{target_sequence}; printf " %d\n", $local_pips->[$i]->{probes}->[$probe_idx]->{start}; printf " %d\n", $local_pips->[$i]->{probes}->[$probe_idx]->{end}; printf " %s\n", $local_pips->[$i]->{probes}->[$probe_idx]->{sequence}; printf " \n"; for (my $j=0; $j<@{$local_pips->[$i]->{probes}->[$probe_idx]->{alignments}}; $j++) { printf " \n", $j+2; printf " %s\n", $local_pips->[$i]->{probes}->[$probe_idx]->{alignments}->[$j]->{species}; printf " %s\n", $local_args->{file}; my ($chromosome, $version) = getProbeInfo($local_pips->[$i], $local_pips->[$i]->{probes}->[$probe_idx]->{alignments}->[$j]->{species}); printf " %s\n", $version; #printf " %d\n", $local_pips->[$i]->{species}->[$j]->{start}; #printf " %d\n", $local_pips->[$i]->{species}->[$j]->{end}; #printf " %s\n",$local_pips->[$i]->{species}->[$j]->{strand}; printf " %s\n", $chromosome; printf " %d\n", $local_pips->[$i]->{probes}->[$probe_idx]->{alignments}->[$j]->{start}; printf " %d\n", $local_pips->[$i]->{probes}->[$probe_idx]->{alignments}->[$j]->{end}; printf " %s\n", $local_pips->[$i]->{probes}->[$probe_idx]->{alignments}->[$j]->{sequence}; printf " \n"; } printf " \n\n\n"; } } } print "\n\n" ; if ($args{probefile} ne "") { # Reset FP so that default is STDOUT close OUT; select STDOUT; } } sub getProbeInfo { my $pip = shift; my $species = shift; my $version; my $chromosome; for (my $i=0; $i<@{$pip->{species}}; $i++) { if ($pip->{species}->[$i]->{species} eq $species) { $version = $pip->{species}->[$i]->{build}; $chromosome = $pip->{species}->[$i]->{chromosome}; last; } } return ($chromosome, $version); } sub getExecutionTime { my @time_info = localtime(time); my $min; my $sec; my $year = $time_info[5] + 1900; # Format Minutes if ($time_info[1] < 10) { $min = "0".$time_info[1]; } else { $min = $time_info[1]; } # Format Sec if ($time_info[0] < 10) { $sec = "0".$time_info[0]; } else { $sec = $time_info[0]; } my $date = $year ."-".$time_info[4]."-".$time_info[3]; my $time = $time_info[2].":$min:$sec"; return ($date, $time); } sub initializeScoringMatrix { my $scoring_file = shift; my @scoring_matrix ; if ($scoring_file ne "") { open (FILE,"<$scoring_file"); my %matrix; my $num_species=0; while () { next if ($_ =~ m/^#/); chomp; my $line = uc($_); if ($line =~ m/^ *\d+ SPECIES/) { $scoring_matrix[$num_species] = %matrix if ($num_species > 0); my @parts = split(" ",$line); $num_species = $parts[0]; } elsif ($line =~ m/^ *\d+ +[0-9.]+$/) { my @parts = split(" ",$line); $matrix{$parts[0]} = $parts[1]; } } $scoring_matrix[$num_species] = \%matrix if ($num_species > 0); close FILE; } return @scoring_matrix; } sub selectProbes { my $cutoff = shift; my $probe_length = shift; my $target_GCscore = shift; my $gc_tolerance = shift; my @pips = @_; my $probe_generated=0; # Algrothimn # 1. Scan through the targeted species pip sequence and generate a probe # from every start bp that is N bp in length. # 2. To determine if probe is better than "best", use the following criteria # in order of listing: # a. If GC Score is within tolerance of target GC Score # if not within tolerance, get next probe from #1 # b. Is the multi-score better than best multi-score, it's better get next from #1 # if not, it's not better, get next probe from #1 # c. If GC Score is closer to target than best, this probe is better, get next from #1 # if not closer, get next from #1 # # 3. Best probe is found, set probe sequence in pip_struct # # For each pip, select a probe for (my $i=0; $i<@pips; $i++) { my $best_probe_start = -1; my $best_score = -1; my $best_gc = 0; my $min_gc = $args{gc} - $args{tolerance}; my $max_gc = $args{gc} + $args{tolerance}; my $probe_selected = 0; my $status_ctr=0; my $pip_seq = $pips[$i]->{target_sequence}; #print "$pip_seq\n"; for (my $probe_start_bp=0; $probe_start_bp $args{tolerance}) { print "BAD GCDistance\n" if ($args{verbose});; next; } $probe_gc = getGCscore($potential_probe); printf "GC :[%.1f] ", ($probe_gc * 100) if ($args{verbose}); my $better_probe = 0; $probe_score = scoreProbe($probe_start_bp, $potential_probe, $pips[$i]->{species}); if ($probe_score < $cutoff) { print " Score too low\n" if ($args{verbose});; next; } $probe_selected = 1; print "Adding Probe (ALL)\n" if (($args{best_probe} == 0) && ($args{verbose})); $pips[$i] = addProbe($pips[$i], $probe_start_bp, $probe_score) if ($args{best_probe} == 0); if ($probe_score > $best_score) { $better_probe = 1; } elsif ($probe_score == $best_score) { # Same probe score, see if GC score is closer to target GC Score. my $best_distance = abs($args{gc} - $best_gc); my $probe_distance = abs($args{gc} - $probe_gc); if ($probe_distance < $best_distance) { $better_probe = 1; } } if ($better_probe == 1) { $best_probe_start = $probe_start_bp; $best_gc = $probe_gc; $best_score = $probe_score; } #print "$potential_probe [$probe_score] [$min_gc] [$probe_gc] [$max_gc]\n"; } if (($best_probe_start > -1) && ($args{best_probe} == 1)) { #print "Best Score [$best_score]\n"; print "Adding Probe (BEST)\n" if ($args{verbose});; $pips[$i] = addProbe ($pips[$i], $best_probe_start, $best_score); } $probe_generated++ if ($probe_selected == 1); #print "\n\n"; } debug_print("Number of pips with a probe selected : [$probe_generated]\n"); return @pips; } sub addProbe { my $pip = shift; my $probe_start = shift; my $score = shift; my %probe; my @elements; $probe{sequence} = substr($pip->{target_sequence}, $probe_start, $args{probe_length}); $probe{start} = $pip->{start} + $probe_start; $probe{end} = $probe{start} + $args{probe_length} - 1; $probe{species} = $pip->{target_species}; $probe{score} = $score; for (my $i=0; $i<@{$pip->{species}}; $i++) { my %probe_alignment; $probe_alignment{species} = $pip->{species}->[$i]->{species}; $probe_alignment{start} = $pip->{species}->[$i]->{start} + $probe_start; $probe_alignment{end} = $probe_alignment{start} + $args{probe_length} - 1; $probe_alignment{sequence} = substr($pip->{species}->[$i]->{sequence}, $probe_start, $args{probe_length}); push @{$probe{alignments}}, \%probe_alignment; } push @{$pip->{probes}}, \%probe; return $pip; } sub scoreProbe { my $probe_offset = shift; my $base_sequence = shift; my $species = shift; my $probe_score = 0; my $num_species = @{$species} + 1; for (my $i=0; $i[0]->{sequence}, $bp + $probe_offset, 1); my $bp3 = substr($species->[1]->{sequence}, $bp + $probe_offset, 1); # BP Example : a=a=a if (($bp1 eq $bp2) && ($bp2 eq $bp3)) { $rank = 1; } # BP EXAMPLE : A=T=C elsif (($bp1 ne $bp2) && ($bp1 ne $bp3) && ($bp2 ne $bp3)) { $rank = 5; } # BP Example : A=T=T elsif (($bp2 eq $bp3) && ($bp1 ne $bp2)) { $rank = 4; } # BP Example : A=A=T or A=T=A elsif ($bp1 eq $bp2) { my $species1 = $species->[0]->{species}; my $species1_distance = abs($evolutionary_distance{$species1} - $evolutionary_distance{$args{species}}); my $species2 = $species->[1]->{species}; my $species2_distance = abs($evolutionary_distance{$species2} - $evolutionary_distance{$args{species}}); if ($species1_distance < $species2_distance) { $rank = 3; } else { $rank = 2; } } else { my $species1 = $species->[0]->{species}; my $species1_distance = abs($evolutionary_distance{$species1} - $evolutionary_distance{$args{species}}); my $species2 = $species->[1]->{species}; my $species2_distance = abs($evolutionary_distance{$species2} - $evolutionary_distance{$args{species}}); if ($species1_distance < $species2_distance) { $rank = 2; } else { $rank = 3; } } } return $rank; } ######################################################################### # getGCscore -- gets an OvergoMaker40 style GC content score for an # overgo. # # idea for algorithm from overgomaker40 by John D. McPherson at Washington # University, St. Louis # # INPUTS: $overgo - sequence of overgo # $overlap - overlap between complimentary probes # $target_gc - target GC proportion (usually .5) # OUTPUTS: returns GC score, lower is better ######################################################################### sub getGCscore { my $overgo = uc shift; my $overlap = $args{overlap}; my $target_gc = $args{gc}; my $overgo_len = length($overgo); my $unique_len = $overgo_len / 2 - $overlap / 2; my $seq = substr($overgo, 0, $unique_len); my $score = abs($target_gc - ($seq =~ tr/GC//) / $unique_len); $seq = substr($overgo, $unique_len, $overlap); $score += abs($target_gc - ($seq =~ tr/GC//) / $overlap); $seq = substr($overgo, -$unique_len); $score += abs($target_gc - ($seq =~ tr/GC//) / $unique_len); # GC content of the edges of the overlap region is better if it's a G # or a C, but this should be less important than the GC content of # the three sections as added above. # K&R says that a float should have at least 6 significant digits. # (Damn, that was a long time ago!) $score -= ( (substr($overgo, $unique_len, 1) =~ tr/GC//) + (substr($overgo, -$unique_len-1, 1) =~ tr/GC//)) * 1E-6; return $score; } ######################################################################### # restrictPipsToRange - This function will open take an array of pips and # remove any pip that doesn't fall into a given # range. The range can be set via command line or # by using a range file. ######################################################################### sub restrictPipsToRange { my $min = shift; my $max = shift; my $range_file = shift; my $species = shift; my @all_pips = @_; # If no range has been given, then just return the orginal pip array if (($min == 0) && ($max == 99999999) && ($range_file eq "")) { debug_print("No Range restriction\n"); return @all_pips; } # Range exists, now remove pips that are not in range my @selected_pips; my @range_list; # Create an 2D-array of all ranges, containing the Min and Max if (($min > 0) || ($max < 99999999)) { push @range_list, [$min, $max]; } if ($range_file ne "") { open(RANGE_FILE, "<$range_file"); while() { # Comment lines start with a #, and must be first character. if (substr($_,0,1) ne "#") { chomp; my @line = split(",",$_); # Make sure the min is less than max before adding to array if ($line[0] < $line[1]) { push @range_list, [$line[0], $line[1]]; } } } } # Range list created, now sort based upon min for easier lookup. @range_list = sort {$a->[0] <=> $b->[0]} @range_list; for (my $i=0; $i<@all_pips; $i++) { for (my $j=0; $j<@range_list; $j++) { $min = $range_list[$j][0]; $max = $range_list[$j][1]; if (($all_pips[$i]->{start} >= $min) && ($all_pips[$i]->{end} <= $max)) { push @selected_pips, $all_pips[$i]; last; } } } debug_print("Number of pips in Range [".(@selected_pips)."]\n"); return @selected_pips; } ######################################################################### # getPipsFromFile - This function will open a .maf file and will read in # all the alignments, creating non-gapped pips from # the alignments ######################################################################### sub getPipsFromFile { my $filename = shift; open(MAF, "<$filename"); my @alignments; my @pips; my %species_struct; my $align_idx = 0; $_ = ; # Skip the MAF File Header debug_print("Reading file in to pip struct\n"); my ($date, $time) = getExecutionTime(); debug_print("Started : $time\n"); while () # Read in the multiple alignments { if ($_ =~ m/^a /) # See if we found the start of the next alignment { my $species_idx = 0; while () # Read aligment to get species sequence { last if ($_ !~ /^s /); chomp; my @data = split(" ",$_); my $source = $data[1]; $species_struct{start} = $data[2]; $species_struct{size} = $data[3]; $species_struct{strand} = $data[4]; $species_struct{sequence} = $data[6]; my ($species, $build, $chromosome) = getSourceInfo($source); $species_struct{species} = $species; $species_struct{build} = $build; $species_struct{chromosome} = $chromosome; push @{$alignments[$align_idx]}, {%species_struct}; $species_idx++; } $align_idx++; } } my $alignment_count = @alignments; ($date, $time) = getExecutionTime(); debug_print("Time : $time\n"); debug_print("Alignments [$alignment_count] converting to pips\n"); convertAlignmentsToPips(\@pips, \@alignments); ($date, $time) = getExecutionTime(); debug_print("Pips Completed: $time\n"); return @pips; } ######################################################################### # convertAlignmentsToPips - This function will take N alignments and # convert each alignment to M non-gapped # alignments, called pips. All pips are # pushed on to the pip array so to generate # a probe from each pip. ######################################################################### sub convertAlignmentsToPips { my $pips = shift; my $alignments = shift; debug_print("Converting Alignments in to Pips\n"); for (my $i=0; $i<@{$alignments}; $i++) { #Determine number of species in alignment my $num_species = @{$alignments->[$i]}; my $start_pip = 0; my $end_pip = 0; my $alignment_idx = 0; my $alignment_length = length($alignments->[$i][0]->{sequence}); while ($alignment_idx < $alignment_length) { # Read through the alignment until you find the begining of a pip # which means that no gap or repeatitive sequence exists in any bp # for any species my $found_pip_start = 0; while (($alignment_idx < $alignment_length) && ($found_pip_start == 0)) { for (my $j=0; $j<$num_species; $j++) { my $bp = substr($alignments->[$i][$j]->{sequence},$alignment_idx,1); if ($bp =~ m/[ATGC]/) { $found_pip_start = 1; } else { $found_pip_start = 0; last; } } $alignment_idx++ if ($found_pip_start == 0); } $start_pip = $alignment_idx; #print "Pip starting at idx [$start_pip]\n"; # # Read the pip until you find a gap or repeatitive bp in any species # my $found_pip_end = 0; while (($alignment_idx < $alignment_length) && ($found_pip_end == 0)) { for (my $j=0; $j<$num_species; $j++) { my $bp = substr($alignments->[$i][$j]->{sequence},$alignment_idx,1); if ($bp =~ m/[^ATGC]/) { $found_pip_end = 1; last; } } $alignment_idx++ if ($found_pip_end == 0); } $end_pip = $alignment_idx; #print "Pip Ending at [$end_pip]\n"; # # Pip start and stop found, extract pip from all species sequence and # push on to pip array; # my $pip_length = $end_pip - $start_pip; if ($pip_length >= $args{probe_length}) { my %pip_struct; $pip_struct{start} = -1; $pip_struct{end} = -1; $pip_struct{target_species} = ""; for (my $j=0; $j<$num_species; $j++) { my %species_struct; my $num_gaps=0; if ($alignments->[$i][$j]->{species} eq $args{species}) { if ($start_pip > 0) { my $leading_seq = substr($alignments->[$i][$j]->{sequence},0,$start_pip); #print "Start pip at [$start_pip]\n"; #print "Start at [$start_pip]\n"; #print "Leading Seq [$leading_seq]\n"; ($num_gaps) = ($leading_seq =~ tr/-//); #print "Num Gaps : [$num_gaps]\n\n\n"; } # Add 1 to start position, since blat and blast are 1 based, while the maf alignment is 0 based. $pip_struct{start} = $alignments->[$i][$j]->{start} + $start_pip - $num_gaps + 1; $pip_struct{end} = $pip_struct{start} + $pip_length - 1; $pip_struct{strand} = $alignments->[$i][$j]->{strand}; $pip_struct{target_species} = $alignments->[$i][$j]->{species}; $pip_struct{target_sequence} = substr($alignments->[$i][$j]->{sequence},$start_pip, $pip_length); $pip_struct{build} = $alignments->[$i][$j]->{build}; $pip_struct{chromosome} = $alignments->[$i][$j]->{chromosome}; } else { $species_struct{species} = $alignments->[$i][$j]->{species}; $species_struct{strand} = $alignments->[$i][$j]->{strand}; $species_struct{sequence} = substr($alignments->[$i][$j]->{sequence},$start_pip, $pip_length); my $leading_seq = substr($alignments->[$i][$j]->{sequence},0,$start_pip-1); my ($num_gaps) = ($leading_seq =~ tr/-//); $species_struct{start} = $alignments->[$i][$j]->{start} + $start_pip - $num_gaps + 1; $species_struct{end} = $species_struct{start} + $pip_length - 1; $species_struct{build} = $alignments->[$i][$j]->{build}; $species_struct{chromosome} = $alignments->[$i][$j]->{chromosome}; push @{$pip_struct{species}}, \%species_struct; } } push @{$pips}, \%pip_struct; } } } my $pip_count = @{$pips}; debug_print("Pip count [$pip_count]\n"); return $pips; } ######################################################################### # getSourceInfo- This function takes the source field from a .maf file # and extracts the species, build and chromosome. ######################################################################### sub getSourceInfo { my $source = shift; my $species_abbr =""; my $species =""; my $build =""; my $chr =""; # The $source from the FAM file is expected in the following format: # # SSBB.chrXX # SS = 2 letter abbrivation for the species # BB = Build number from UCSC # XX = Chromosome number # ($species_abbr, $build, $chr) = ($source =~ m/^(\D+)(\d+)\.chr(.+)/); $species = $species_abbr; $species = "human" if ($species_abbr eq "hg"); $species = "mouse" if ($species_abbr eq "mm"); $species = "rat" if ($species_abbr eq "rn"); return ($species, $build, $chr); } sub debug_print { my $string = shift; if ($args{debug_file} ne "") { open (LOG,">>".$args{debug_file}); print LOG $string; close LOG; } else { print $string; } return; } ######################################################################### # getParams - gets all options from the command line or keeps defaults # dies if required options are not included ######################################################################### sub getParams { my %opts; my $errstr = ""; my $filename; use Getopt::Long; GetOptions(\%opts, 'help', 'debug_file=s', 'probefile=s', 'length=i', 'min=i', 'max=i', 'range_file=s', 'species=s', 'cutoff=f', 'tolerance=f', 'gc=f', 'overlap=i', 'scoring_file=s', 'best_probe', 'verbose'); if (defined($opts{help})) { $errstr = " "; } else { $filename = $ARGV[0]; if ( ! -e $filename ) { $errstr .= "ERROR : Input file not found.\n"; } else { # Check to see if header line in file is indicates # that the file is a .maf version 1 format. open ("FILE","<".$filename); $_ = ; close FILE; chomp; if ( $_ !~ m/^##maf .*version=1/ ) { $errstr .= "ERROR : Alignment File not a valid MAF version 1 file\n"; $errstr .= " First line must start with ##maf and contain version=1\n"; } } } if ($errstr) { print <<"END_USAGE"; MULTI_SOOP.PL Description: Generate probes from a multi-species alignment file (.maf) Usage: multi-soop.pl [-scoring ] -help Multi-speices alignment file. -length Number of basepairs in probe. (Default=36) -scoring Scoring Template File. END_USAGE print "\n$errstr" if ($errstr); die("\n"); } return ( file => $filename, probe_length => $opts{length}||36, min => $opts{min} || 0, max => $opts{max} || 99999999 , range_file => $opts{range_file} || "", species => $opts{species}, cutoff => $opts{cutoff} || 31.82, gc => $opts{gc} || .5, tolerance => $opts{tolerance} || .06, overlap => $opts{overlap} || 8 , probefile => $opts{probefile} || "probes.xml", best_probe => $opts{best_probe} || 0, debug_file => $opts{debug_file} || "", scoring_file => $opts{scoring_file} || "", verbose => $opts{verbose} || 0 ); } ############################################################################## # Start Description # Generate probes from a multi-species alignment file (.maf) using scoring parameters from configuration file. # End Description ############################################################################## ######################################################################### # END PROGRAM -- BEGIN POD DOCUMENTATION ######################################################################### =head1 NAME B - system for optimized overgo picking for multi species alignment =head1 SYNOPSIS S B<[options]> IMulti Species FileE>> =head1 DESCRIPTION Soop designs "overgo" DNA hybridization probes from a mutliple species sequence file. This basic idea hehind designing probes from a multi-species alignment is that orthologous sequences, between multiple species, that are highly concerved are likely to also be conserved in another species. Each base pair is given a score, the sum of the scores determines the probe score. The higher the score the more likely the probe will work in an unknown species. =over 4 =item MULTI-SPECIES FILE The format supported for multi-species alignment is the UCSC format. This format basically has a fasta like description header, followed by N species' sequence. =back =head1 OPTIONS =over 4 =item B<-score> I Because of the complexity of scoring a probe, all scoring parameters will be read from a scoring file. Below is the order of precedence for scoring values: 1. External Scoring File 2. Default External Scoring File (multi_soop.config) 3. Default Internal Scoring values =item B<-probe> I This is the name of the species that will be used to generate the probe sequence from. =item B<-g> I The ideal spacing between probes in kilobases. Soop trys to make the probes from ungapped alignments this far apart. Defaults to I<30>. =item B<-h> Print out this message (requires C in your path) and exit. (Overrides all other options) =item B<-i> I Proportion minimum identity for probes to be made. In other words if B is .80 no overgo will be made with less than 80% similarity between the two sequences. Defaults to I<.80>. =item B<-l> I Length in bases of overgo to design. If B is 36 then the overgos will be made of two primers who's final lengths post extension will be 36 base pairs. Defaults to I<36>. =item B<-n> I Prefix for names of primers in .stj (primers) file. If this option is present the primers will be named ##.1 and ##.2 for each primer with the ## incrementing by one from 00. If a primer file is written and this option is not specified the primer names will be the first "words" on the defline that match the character class [A-Za-z0-9_-.]. =item B<-o> I Overlap in bases between the two primers. Defaults to I<8> =item B<-p> I Name of file to write primers in (In "Give to Jackie" format). When making overgos from a FASTA formatted file this is the output filename. Not written by default. =item B<-r> Make probes/overgos from the "reference" or "subject" sequence. This does not affect which coordinates are used for setting probe spacing. They're always the reference/subject sequence (for now). By default the probes are made from the Query sequence. =item B<-s> I Start at this offset (in bases). This option isn't used very much, it describes an offset to start making probes at. In other words if you wanted to make some probes but skip the first 100kb of reference sequence you would set this to 100000. Deafults to I<0>. =item B<-S> I Write summery information about the probes chosen to this file. Not written by default. =item B<-t> I Target proportion GC for the overgo as a whole. Defaults to I<.5> =item B<-v> Print the version number and exit. (Overrides all other options except B<-h>) =item B<-w> I Wiggle room for the GC proportion. In other words the amount the GC content can vary from the ideal GC content; if B<-t> was set to .5 and B<-w> was set to .06 than the GC content of the overgo would have to be in the range of 44% to 56% GC. Defaults to I<.06> =back =head1 OUTPUT FILE FORMATS =over 4 =item overgo file The overgo files (those written by the B<-f> and B<-a> options) are written in FASTA Format. With the defline in the following format: =back > ### pip:[ref:(-) query:(-) \ id:] og:[id:% offset: diff:] =over 4 =item The defline of the query sequence if in a comparison mode (BLAST or PipMaker). The defline of the fasta entry if in FASTA mode. =item pip:[ref:(-) query:(-) id:<%id>%] The characteristics of the pip that this overgo was made from. Start and ending coordinates in the reference/subject sequence and start and end coordinates in the query sequence. If in FASTA mode the reference coordinates (ref:(#-#)) will be 0 to the length of the fasta entry. Either "ref" or "query" will be capitalized to indicate which sequence the overgo was made from. =item og:[id:<%id> offset: The characteristics of the overgo itself. <%id> is the percent identity of the overgo alone. is the number of bases from the first base of the ungapped alignment or contig that the overgo starts. =item diff: This is a sequence comparison between the sequence of the overgo and the other sequence. From this you can retrieve either the overgo sequence or the sequence it was compared to. The format is the overgo sequence with a > after any base that changes where _ indicates a gap on either side. For example the following aligned sequences: ACTG TCAGA || |-:|-|| ACGGTCC GA would be represented "ACT>GG_>TCA>_GA" The format can be converted to the sequence of the overgo with the regular expression s/\>[ACTG]|_//g or to the sequence it was compared to with the regular expression s/[ACTG]\>|_//g. =item primer file Format of the primer file is simply the name of the primer then the sequence of that primer. Primer names are the overgo name with a .1 or .2 appended depending whcih strand the oligo is from. For example 4href03.1 GCTGGGTGTGCCTGGGGATCAT 4href03.2 GTTTTAACAATAAAATGATCCC =back =head1 USAGE This is a brief summary of how we use B. =over 4 =item 1 The first thing to do is to put together a "finished" ungapped reference sequence that will be used as a basis for the comparison and location estimation for other species. This should be your best estimate of the true finished sequence for the reference organism (human in our case). =item 2 Get comparison sequence from another species. We have used both finished mouse sequence from BACs that align to our reference sequence as well as now we are using mouse whole genome shotgun traces downloaded from Ensembl's web site (http://www.ensembl.org) that align to golden path coordinates similar to ours. =item 3 Mask the repeats from both sequences. We use RepeatMasker (http://repeatmasker.genome.washington.edu/cgi-bin/RepeatMasker) to mask both the reference and the other species sequences. =item 4 Run an NCBI blast (ftp://ftp.ncbi.nlm.nih.gov/blast/executables/) or PipMaker (http://bio.cse.psu.edu/pipmaker/) with a database of the reference sequence and a query of the other species. If you run PipMaker You'll need the Verbose text output (The one the web site says you don't want or need). =item 5 Finally we get to running B. You'll need to run B using either the BLAST2 standard text output or PipMaker verbose file. If you just want the default parameters B can be run as just C I>. =item 6 You I then re-blast the overgos file that B produces against appropriate databases to insure that it hasn't designed low complexity or repetitive probes. You should make sure that all high value hits make sense as coming from one position in a given genome. If you have to drop probes at this point you can use the B<-a> option to find nearby replacements. =back =head1 FILES F is necessary for parsing NCBI blast 2.0 output. It's available at http://genome.nhgri.nih.gov/blastall =head1 VERSION soop $Revision: 2.11 $ $Date: 2002/12/20 04:21:31 $ UTC =head1 BUGS Files with long stretches of NNNNNN's that span more than one line cause the PipMaker text file parser (parseVerbose) to barf and give incorrect data (subject becomes query). The "reference sequence" is used to determine overgo spacing. Currently there is no way to use the query sequence coordinates. Will overwrite input file in FASTA mode without asking permission. This is especially insidious when the input filename is "overgos". (note to self: should be fixed in getParams()) Should have offset (B<-s>) or "start at" option for FASTA mode. =head1 AUTHOR Written by Arjun Prasad (aprasad@nhgri.nih.gov) Some ideas taken from Joe Ryan (Blastall.pm) and John D. McPherson (overgo GC content optimization) =head1 SEE ALSO =over 4 =item RepeatMasker http://repeatmasker.genome.washington.edu/cgi-bin/RepeatMasker =item PipMaker http://bio.cse.psu.edu/pipmaker/ =item NCBI Blast 2 http://www.ncbi.nlm.nih.gov/BLAST/ Stand alone blast (recommended): ftp://ftp.ncbi.nlm.nih.gov/blast/executables/ =item Ensembl http://www.ensembl.org =back =head1 COPYRIGHT PUBLIC DOMAIN NOTICE This software/database is ``United States Government Work'' under the terms of the United States Copyright Act. It was written as part of the authors' official duties for the United States Government and thus cannot be copyrighted. This software/database is freely available to the public for use without a copyright notice. Restrictions cannot be placed on its present or future use. Although all reasonable efforts have been taken to ensure the accuracy and reliability of the software and data, the National Human Genome Research Institute (NHGRI) and the U.S. Government does not and cannot warrant the performance or results that may be obtained by using this software or data. NHGRI and the U.S. Government disclaims all warranties as to performance, merchantability or fitness for any particular purpose. In any work or product derived from this material, proper attribution of the authors as the source of the software or data should be made, using: I as the citation. =cut