#!/usr/bin/perl
#########################################################################
#
# scrubber.pl
#
# Purpose:
#
#   Soop.pl uses several formats to generate the list of potential probes.
#   Since the source data set can contain repetitive sequence from either
#   the target or query species, Soop may create probes that contain
#   repetitive sequence.  This program is designed to "scrub" the sequence
#   file and replace any repetitive sequence, denoted by lower case, with
#   X's.  Soop will ignore any X's, basically treating them like a gap.
#
#   There are currently 2 file formats supported, fasta and the PipMaker
#   (verbose) formats.  The scrubbed file is printed to STDOUT and
#   should be redirected to the destination file.
#
# Options:
#
#   This script takes 2 required options.  -file gives the name of the
#   source file.  -type gives the type of file and can be set to either
#   "fasta" or "verbose" (compared case-insensitively).
#
#
# Version 1.0
# Developed by : Brian Carlson
# Date         : April 2, 2003
#
#########################################################################

use strict;
use warnings;
use subs;
use Getopt::Long;

#########################################################################
#
# MAIN
#
#########################################################################

# Run only when executed as a script, so the subs below can be loaded
# (e.g. with require) without parsing @ARGV or reading any file.
main() unless caller;

#########################################################################
# main - parses the command line and dispatches to the scrubber that
#        matches the requested file type
#########################################################################
sub main {
    my %options = getParams();

    scrub_fasta($options{file})   if ($options{type} eq "FASTA");
    scrub_verbose($options{file}) if ($options{type} eq "VERBOSE");

    return;
}

#########################################################################
# scrub_fasta - This will take a fasta file and replace repetitive
#               sequence (lower case a/c/g/t) with X's
#
#   $filename - path of the fasta file to read
#   $out_fh   - optional filehandle to print to (defaults to STDOUT)
#
#   Lines starting with '>' are headers and are printed unchanged.
#   Dies if the file cannot be opened.
#########################################################################
sub scrub_fasta {
    my ($filename, $out_fh) = @_;

    # Anchored regex match: only a leading '>' marks a header line.
    return _scrub_file($filename, $out_fh, sub { $_[0] !~ m/^>/ });
}

#########################################################################
# scrub_verbose - This will take a PipMaker (verbose) file and replace
#                 repetitive sequence (lower case a/c/g/t) with X's
#
#   $filename - path of the verbose file to read
#   $out_fh   - optional filehandle to print to (defaults to STDOUT)
#
#   Only lines made of optional leading spaces, a number, one space and
#   then nothing but A/T/G/C (either case) and spaces are masked; every
#   other line is printed unchanged.
#   Dies if the file cannot be opened.
#########################################################################
sub scrub_verbose {
    my ($filename, $out_fh) = @_;

    return _scrub_file($filename, $out_fh,
        sub { $_[0] =~ m/^ *\d+ [ATGCatgc ]+$/ });
}

#########################################################################
# _scrub_file - shared read/mask/print loop used by both scrubbers
#
#   $filename    - path of the file to read
#   $out_fh      - filehandle to print to; STDOUT when undefined
#   $should_mask - code ref called with each line; a true return means
#                  the line's lower case a/c/g/t are replaced with X
#########################################################################
sub _scrub_file {
    my ($filename, $out_fh, $should_mask) = @_;

    $out_fh = \*STDOUT unless defined $out_fh;

    # Three-argument open so the filename can never be read as a mode.
    open(my $in_fh, '<', $filename)
        or die "ERROR : cannot open [$filename] for reading: $!\n";

    while (my $line = <$in_fh>) {
        $line =~ tr/acgt/XXXX/ if $should_mask->($line);
        print {$out_fh} $line;
    }

    close($in_fh)
        or die "ERROR : cannot close [$filename]: $!\n";

    return;
}

#########################################################################
# getParams - gets all options from the command line
#             dies if required options are not included
#
#   Returns a list of key/value pairs: file => the -file value,
#   type => the -type value upper-cased ("FASTA" or "VERBOSE").
#
#   On any validation error the usage text and the collected error
#   messages are printed to STDERR (STDOUT carries the scrubbed data
#   and is normally redirected to a file), then the script dies.
#########################################################################
sub getParams {
    my %opts;
    my $errstr = "";

    GetOptions(\%opts, 'file=s', 'type=s');

    if (!defined($opts{file})) {
        $errstr .= "ERROR : No Source file defined.\n";
    }
    elsif (!-e $opts{file}) {
        $errstr .= "ERROR : [" . $opts{file} . "] file doesn't exist\n";
    }

    if (!defined($opts{type})) {
        $errstr .= "ERROR : No file type specified.\n";
    }
    elsif ((uc($opts{type}) ne "FASTA") && (uc($opts{type}) ne "VERBOSE")) {
        $errstr .= "ERROR [" . $opts{type} . "] not a supported type of fasta or verbose\n";
    }

    if ($errstr) {
        # Only the options GetOptions actually accepts are listed here.
        print STDERR <<"END_USAGE";

Description:
    This program will take either a Fasta formated file, or a PipMaker
    output file, and will replace the repeative sequence (lower case)
    with X's. This is done so that when Soop is ran, it will ignore the
    repeative seuqnce when generating probes.

Usage:
    scrubber.pl [options]

Options:
    -file                      File to have repeative sequence replaced with X's.
    -type [ fasta | verbose ]  File type to be scrubed, either fasta or verbose

END_USAGE
        print STDERR "\n$errstr";
        die("\n");
    }

    return (file => $opts{file}, type => uc($opts{type}));
}

1;