#!/usr/bin/perl use strict; use Data::Dumper; use DBI; use subs ; my %opts = getParams(); #my $file = "/var/www/html/upload/".$opts{file}; my $file = $opts{file}; my $db = DBI->connect("DBI:mysql:database=yourDbName;host=localhost","yourDbId","yourDbPassword"); my %species_list = initialize_species_id(); my $version_id = $opts{version}; print "Version [$version_id]\n"; my $file1_id = 0; my $file1_name = ""; my $file2_id = 0; my $file2_name = ""; my @idx_array; print "[$file]\n"; my $alignment_id = -1; open XML_FILE, "<$file"; my $fp = -1; # Read file and sort File Offset in ascending order while () { if ($_ =~ m//) { $fp = tell XML_FILE; $fp = $fp - length($_); } elsif ($_ =~ m//) { my ($start) = ($_ =~ m/probe_start>(.+)<\/probe/); push @idx_array, [$start,$fp] if ($fp >= 0); $fp = -1; } } @idx_array = sort {$a->[0] <=> $b->[0]} @idx_array; print "Adding probes to probes table\n"; my $probe_count=0; for (my $i=0; $i<@idx_array; $i++) { if ($alignment_id <= 0) { # Insert alignment information my $query = $db->prepare("select ifnull(max(alignment_id),0) + 1 as next_id from alignments"); $query->execute; my $row = $query->fetchrow_hashref(); $alignment_id = $row->{next_id}; } else { $alignment_id++; } my $fp = $idx_array[$i][1]; seek XML_FILE, $fp, 0; $_ = ; my ($probe_score) = ($_ =~ m/probe_identity>(.+)prepare("$sql"); #$query->execute; my $last_source_name = "EMPTY"; # Loop through the species alignments and add sequence to database while (($_ = ) !~ m/<\/probe>/) { my $sequence; my $start_bp; my $end_bp; my $chromosome; my $strand; my $source_file_name; my $species; my $source_file_id; my $species_id; my $source_species; my $public; my $parent_alignment_start; my $parent_alignment_end; $strand = 0; while (($_ = ) !~ m/<\/species>/) { ($sequence) = ($_ =~ m/>(.+)<\//) if ($_ =~ m/sequence/); ($start_bp) = ($_ =~ m/>(.+)<\//) if ($_ =~ m/probe_start/); ($end_bp) = ($_ =~ m/>(.+)<\//) if ($_ =~ m/probe_end/); ($chromosome) = ($_ =~ m/>(.+)<\//) if ($_ =~ m/chromosome/); ($strand) = ($_ =~ m/>(.+)<\//) if ($_ =~ m/strand/); ($source_file_name) = ($_ =~ m/>(.+)<\//) if ($_ =~ m/file/); ($species) = ($_ =~ m/>(.+)<\//) if ($_ =~ m/name/); $species = uc($species); if ($source_file_name ne $last_source_name) { $source_file_id=0; my $query = $db->prepare("select file_id from source_files where filename = \"$source_file_name\""); $query->execute; if ($query->rows == 1) { my $row = $query->fetchrow_hashref(); $source_file_id = $row->{'file_id'}; } $last_source_name = $source_file_name; } } $species_id = $species_list{$species}; if ($species_id == 1) { $chromosome = convertChromosome("chr".$chromosome); my ($primer1, $primer2) = primerMaker($sequence, 8); if ($species eq $source_species) { $public = 1; } else { $public = 0; } my $probe_type = "x"; my $db_probe_type = 1; if ($opts{type} eq "VALID") { $probe_type = "u"; $db_probe_type = 2; } my $gc_score = getGCscore($sequence); $gc_score = $gc_score * 100; if ($gc_score < 55.36) { $probe_count++; my $name = $probe_type."_hsa_".$chromosome."_".$version_id."_".$probe_count; my $length = length($sequence); my $sql = "insert into probes "; $sql .= "(sequence, version_id, start_bp, stop_bp, chromosome, strand, public, "; $sql .= "species_id, name, alignment_id, file_id, length, primer1, primer2, "; $sql .= "parent_alignment_start, parent_alignment_stop, score, type, gc_score) values "; $sql .= "(\"$sequence\", $version_id, $start_bp, $end_bp, $chromosome, $strand, $public, "; $sql .= "$species_id, \"$name\", $alignment_id, $file1_id, $length, \"$primer1\", \"$primer2\","; $sql .= "0 , 0 , $probe_score, $db_probe_type , $gc_score)"; $query = $db->prepare("$sql"); $query->execute; } } } } ######################################################################### # getGCscore -- gets an OvergoMaker40 style GC content score for an # overgo. # # idea for algorithm from overgomaker40 by John D. McPherson at Washington # University, St. Louis # # INPUTS: $overgo - sequence of overgo # $overlap - overlap between complimentary probes # $target_gc - target GC proportion (usually .5) # OUTPUTS: returns GC score, lower is better ######################################################################### sub getGCscore { my $overgo = uc shift; my $overlap = 8; my $target_gc = .5; my $overgo_len = length($overgo); my $unique_len = $overgo_len / 2 - $overlap / 2; my $seq = substr($overgo, 0, $unique_len); my $score = abs($target_gc - ($seq =~ tr/GC//) / $unique_len); $seq = substr($overgo, $unique_len, $overlap); $score += abs($target_gc - ($seq =~ tr/GC//) / $overlap); $seq = substr($overgo, -$unique_len); $score += abs($target_gc - ($seq =~ tr/GC//) / $unique_len); # GC content of the edges of the overlap region is better if it's a G # or a C, but this should be less important than the GC content of # the three sections as added above. # K&R says that a float should have at least 6 significant digits. # (Damn, that was a long time ago!) $score -= ( (substr($overgo, $unique_len, 1) =~ tr/GC//) + (substr($overgo, -$unique_len-1, 1) =~ tr/GC//)) * 1E-6; return $score; } sub initialize_species_id { my $query = $db->prepare("select * from species"); my %species; $query->execute; while (my $row = $query->fetchrow_hashref()) { my $name = uc($row->{species}); $species{$name} = $row->{species_id}; } return %species } sub primerMaker { my($sequence) = uc shift; my($overlap) = shift; my($primer_length) = length($sequence) / 2 + $overlap / 2; my($template_primer) = substr($sequence, 0, $primer_length); # grab the end of the sequence and reverse it my($rev_comp_primer) = join('',reverse(split(//, substr($sequence, -$primer_length)))); # now convert to complimentary bases $rev_comp_primer =~ tr/ATCG/TAGC/; return ($template_primer, $rev_comp_primer); } sub convertChromosome { my $rc; my $chr = uc shift; $rc = 0; if ($chr eq "CHR1") { $rc = 1; } if ($chr eq "CHR2") { $rc = 2; } if ($chr eq "CHR3") { $rc = 3; } if ($chr eq "CHR4") { $rc = 4; } if ($chr eq "CHR5") { $rc = 5; } if ($chr eq "CHR6") { $rc = 6; } if ($chr eq "CHR7") { $rc = 7; } if ($chr eq "CHR8") { $rc = 8; } if ($chr eq "CHR9") { $rc = 9; } if ($chr eq "CHR10") { $rc = 10; } if ($chr eq "CHR11") { $rc = 11; } if ($chr eq "CHR12") { $rc = 12; } if ($chr eq "CHR13") { $rc = 13; } if ($chr eq "CHR14") { $rc = 14; } if ($chr eq "CHR15") { $rc = 15; } if ($chr eq "CHR16") { $rc = 16; } if ($chr eq "CHR17") { $rc = 17; } if ($chr eq "CHR18") { $rc = 18; } if ($chr eq "CHR19") { $rc = 19; } if ($chr eq "CHR20") { $rc = 20; } if ($chr eq "CHR21") { $rc = 21; } if ($chr eq "CHR22") { $rc = 22; } if ($chr eq "CHRX") { $rc = 23; } if ($chr eq "CHRY") { $rc = 24; } return $rc; } sub getParams { my %opts; my $filetype; use Getopt::Long; GetOptions(\%opts, 'file=s', 'version=i', 'type=s'); return ( file => $opts{file}, version => $opts{version}, type => uc($opts{type}) ); } ############################################################################## # Start Description # Insert probes from file into database. # End Description ##############################################################################