#!/usr/bin/perl
use strict;
use warnings;
require XML::Simple;
use subs;
use Data::Dumper;
use File::Basename;
use Benchmark;
use Getopt::Long;

# This script takes an XML file of probes and writes the probes out as
# FASTA records: a definition line built from the probe/species name,
# chromosome, probe start and probe end, followed by the sequence.
# The records are split over numbered output files
# (<output>.1, <output>.2, ...), each holding at most -count probes.

my %opt = getParams();

my $xml = XML::Simple->new();

# forcearray keeps <probe> and <species> as array references even when the
# file holds only one of them; without it XML::Simple folds a lone element
# into a hash and the array dereferences below would die.
my $all_probes = $xml->XMLin(
    $opt{probe_file},
    keyattr    => {},
    forcearray => [ 'probe', 'species' ],
);
#print Dumper(\$all_probes);

# A file without any <probe> element yields no list at all.
my @probes;
if (ref($all_probes) eq 'HASH' && ref($all_probes->{probe}) eq 'ARRAY') {
    @probes = @{ $all_probes->{probe} };
}

my $filecount = 1;
my $count     = 0;
my $idx       = $opt{species_idx};
my $out       = _open_output($opt{output_file}, $filecount);

for my $probe (@probes) {

    # Current batch is full: roll over to the next numbered file.
    if ($count >= $opt{max_probes}) {
        close($out)
            or die "ERROR : Cannot close '$opt{output_file}.$filecount': $!\n";
        $filecount++;
        $count = 0;
        $out   = _open_output($opt{output_file}, $filecount);
    }

    # The species entry may be absent when species_idx is out of range.
    my $species = {};
    if (ref($probe->{species}) eq 'ARRAY'
        && ref($probe->{species}->[$idx]) eq 'HASH') {
        $species = $probe->{species}->[$idx];
    }

    # Definition line: the probe name and species name are concatenated
    # without a separator; the coordinates are comma separated.
    my $def_line = "";
    for my $name ($probe->{name}, $species->{name}) {
        my $text = _scalar_field($name);
        $def_line .= $text if defined $text;
    }
    for my $field (qw(chromosome probe_start probe_end)) {
        my $text = _scalar_field($species->{$field});
        $def_line .= "," . $text if defined $text;
    }

    # An empty <sequence/> is written as an empty sequence line rather
    # than as a stringified hash reference.
    my $sequence = _scalar_field($species->{sequence});
    $sequence = "" if !defined $sequence;

    print {$out} ">$def_line \n";
    print {$out} $sequence . "\n\n";
    $count++;
}

# Buffered write errors only surface on close, so check it.
close($out)
    or die "ERROR : Cannot close '$opt{output_file}.$filecount': $!\n";

#########################################################################
# _open_output - opens "<basename>.<number>" for writing
#                returns the file handle, dies if the file cannot be opened
#########################################################################
sub _open_output {
    my ($basename, $number) = @_;

    my $path = $basename . "." . $number;
    open(my $fh, '>', $path)
        or die "ERROR : Cannot open '$path' for writing: $!\n";
    return $fh;
}

#########################################################################
# _scalar_field - text of one parsed XML field
#                 returns undef for an empty element (XML::Simple hands
#                 those over as an empty hash), '' for a missing field,
#                 and the value itself otherwise
#########################################################################
sub _scalar_field {
    my ($value) = @_;

    return if ref($value) eq 'HASH';
    return defined $value ? $value : "";
}

#########################################################################
# getParams - gets all options from the command line or keeps defaults
#             dies if required options are not included
#
# Returns a hash with the keys:
#   probe_file  - value of -file (required)
#   output_file - value of -output (required)
#   max_probes  - value of -count, 99999999 when absent or 0
#   species_idx - value of -species_idx, 0 when absent
#
# NOTE(review): the usage text and the description at the end of this file
# both state a default of 3000 probes per file, but the code falls back to
# 99999999. The runtime default is left untouched here - confirm which of
# the two is intended.
#########################################################################
sub getParams {
    my %opts;
    my $errstr = "";

    GetOptions(\%opts, 'file=s', 'output=s', 'count=i', 'species_idx=i');

    if (!defined($opts{file})) {
        $errstr .= "ERROR : No probes sources defined.\n";
    }
    if (!defined($opts{output})) {
        $errstr .= "ERROR : No output file specified.\n";
    }

    if ($errstr) {
        print <<"END_USAGE";
Description:
  This script will take a XML probe file, and create a FASTA formated file
  that is ready for megablast. The scripts outputs several files with the
  same basename, each file containing N probes (default 3000)

Usage:
  create_megablast_file.pl [-count <n>] -file <probe file> -output <basename> [-species_idx <i>]

Probe Sources:
  -file         Probe file to split in to megablast ready batches.
  -output       Basename for the output file (extension will be .1-n)
  -count        Number of probes per file
  -species_idx  Index of the species entry of each probe to write (default 0)
END_USAGE
        print "\n$errstr";
        die("\n");
    }

    return (
        probe_file  => $opts{file},
        output_file => $opts{output},
        max_probes  => $opts{count} || 99999999,
        species_idx => $opts{species_idx} || 0
    );
}

##############################################################################
# Start Description
# From one XML probe file prepare one or more FASTA format files ready for megablast. Each FASTA file will contain maximum of N probes (default 3000).
# End Description
##############################################################################