#!/usr/bin/perl
use strict;
use warnings;
use Data::Dumper;
use Getopt::Long;

# NOTE(review): the copy of this script that was reviewed had been collapsed
# onto two lines and had every "<...>" sequence stripped out of it.  The
# pieces marked NOTE(review) below were reconstructed from the surrounding
# code and must be confirmed against the original before production use.

my %params = getParams();

# "0" means "every chromosome"; anything else selects a single chromosome.
my $status = ($params{chr} eq "0") ? "Complete Genome" : "Chr" . $params{chr};

# Collect the validation result files (chrN.xml.<chunk>.valid) found in the
# current directory, optionally restricted to the requested chromosome.
opendir(my $dir, ".") or die "Cannot read current directory: $!";
my @validated_files;
while (defined(my $entry = readdir($dir))) {
    next unless $entry =~ m/^chr.{1,2}\.xml\.\d+\.valid/;
    next if $params{chr} ne "0" && $entry !~ m/^chr\Q$params{chr}\E\./;
    push @validated_files, $entry;
}
closedir($dir);

print "Processing : $status\n";

# A plain string sort keeps all chunks of one chromosome adjacent, which the
# loop below relies on to open each chromosome's outputs exactly once.
@validated_files = sort @validated_files;

my $last_chr = "";
my %probes;                                   # header => byte offset of probe
my ($probes_fh, $unique_fh, $non_unique_fh);  # handles for current chromosome

for my $validated_file (@validated_files) {
    my ($current_chr) = ($validated_file =~ m/^chr(.+?)\./);

    if ($current_chr ne $last_chr) {
        print "\nProcessing Chr$current_chr\n";
        _close_chromosome_outputs($probes_fh, $unique_fh, $non_unique_fh)
            if $last_chr ne "";
        $last_chr = $current_chr;

        my $chr_file = "chr$current_chr.xml";
        %probes = index_file($chr_file, "human");
        open($probes_fh, '<', $chr_file)
            or die "Cannot open $chr_file: $!";
        $unique_fh     = _open_probe_output("chr$current_chr.probes.xml.valid");
        $non_unique_fh = _open_probe_output("chr$current_chr.probes.xml.invalid");
    }

    print "Processing --> $validated_file\n";
    open(my $results_fh, '<', $validated_file)
        or die "Cannot open $validated_file: $!";

    my $species = "";
    while (defined(my $line = <$results_fh>)) {
        my ($result, $header);
        my ($start_bp, $end_bp) = (0, 0);

        # Two layouts: "RESULT HEADER" (exactly one space) or
        # "RESULT HEADER START END" carrying corrected coordinates.
        if (($line =~ tr/ //) == 1) {
            ($result, $header) = ($line =~ m/^(.+) (.+)$/);
        }
        else {
            ($result, $header, $start_bp, $end_bp)
                = ($line =~ m/^(.+) +(.+) +(\d+) +(\d+)$/);
        }
        next unless defined $header;    # malformed line: nothing to look up

        # The species is the text before the first comma of the first header.
        if ($species eq "") {
            ($species) = ($header =~ m/^(.+?),/);
            $species = "" unless defined $species;
        }

        next unless exists $probes{$header};

        if ($result eq "VALID") {
            printProbe($unique_fh, $probes_fh, $probes{$header},
                       $species, $start_bp, $end_bp);
        }
        elsif ($result ne "NOT_FOUND") {
            printProbe($non_unique_fh, $probes_fh, $probes{$header},
                       $species, $start_bp, $end_bp);
        }
    }
    close($results_fh);
}

# The footer used to be written only when the chromosome changed, so the
# last chromosome's outputs were left unterminated and unclosed.
_close_chromosome_outputs($probes_fh, $unique_fh, $non_unique_fh)
    if $last_chr ne "";

# Opens $path for appending, writes the output header and returns the handle.
sub _open_probe_output {
    my ($path) = @_;
    open(my $out_fh, '>>', $path) or die "Cannot open $path for append: $!";
    # NOTE(review): these literals presumably held the XML prolog and the
    # opening root element before tag stripping; only the newlines survived,
    # so they are reproduced as found - restore the real markup.
    print {$out_fh} "\n\n";
    print {$out_fh} "\n\n";
    return $out_fh;
}

# Writes the footer to each output handle, then closes the outputs and the
# probe input handle of the chromosome that has just been finished.
sub _close_chromosome_outputs {
    my ($in_fh, @out_fhs) = @_;
    for my $out_fh (@out_fhs) {
        # NOTE(review): presumably the closing root element; see the note in
        # _open_probe_output.
        print {$out_fh} "\n\n";
        close($out_fh) or die "Cannot close output file: $!";
    }
    close($in_fh);
    return;
}

# Copies one probe record from the input handle to the output handle.
#
# Arguments (positional):
#   1. output handle (glob reference or lexical handle)
#   2. input handle positioned on the chromosome XML file
#   3. byte offset of the record's first line (as produced by index_file)
#   4. species whose coordinates may be overridden
#   5. replacement start coordinate (used only when > 0)
#   6. replacement end coordinate   (used only when > 0)
#
# Lines are copied verbatim up to and including the line containing
# "</probe>", followed by one extra newline.  After the line that ends the
# species name, the probe_start / probe_end lines are rewritten with the
# supplied coordinates.
sub printProbe {
    my ($out_fh, $in_fh, $position, $species, $start_bp, $end_bp) = @_;

    seek($in_fh, $position, 0)
        or die "Cannot seek to offset $position: $!";

    my $in_species = 0;
    while (defined(my $line = <$in_fh>)) {
        if ($line =~ m/<\/probe>/) {
            print {$out_fh} $line;
            print {$out_fh} "\n";
            return;
        }

        $in_species = 1 if $line =~ m/\Q$species\E<\/name>/;

        # First tag on the line; like the original pattern, this requires at
        # least one character (the indentation) before the "<".
        my ($tag) = $in_species ? ($line =~ m/^.+?<(.+?)>/) : ();
        $tag = "" unless defined $tag;
        my ($indent) = ($line =~ m/^(\s*)/);

        # NOTE(review): the element markup in the two replacement lines was
        # stripped from the reviewed copy (only " $start_bp\n" survived); it
        # is rebuilt here from the tag names being compared.
        if ($tag eq "probe_start" && $start_bp > 0) {
            print {$out_fh} "$indent<probe_start>$start_bp</probe_start>\n";
        }
        elsif ($tag eq "probe_end" && $end_bp > 0) {
            print {$out_fh} "$indent<probe_end>$end_bp</probe_end>\n";
            $in_species = 0;    # done with this species' coordinates
        }
        else {
            print {$out_fh} $line;
        }
    }

    # Reached end of file without a closing tag; the original looped forever.
    warn "Probe at offset $position has no closing </probe> tag\n";
    return;
}

# Reads the next line from $fh and returns the first capture of $pattern,
# or undef at end of file / when the line does not match.
sub _read_field {
    my ($fh, $pattern) = @_;
    my $line = <$fh>;
    return undef unless defined $line;
    my ($value) = ($line =~ $pattern);
    return $value;
}

# Builds a lookup from probe header to the byte offset at which the probe
# record starts in $file, considering only the entry for $species.
#
# Returns the lookup as a flat hash (list of key/value pairs).
sub index_file {
    my ($file, $species) = @_;
    my %lookup;

    print "Indexing file [$file]\n";
    open(my $fh, '<', $file) or die "Cannot open $file: $!";

    while (defined(my $line = <$fh>)) {
        # NOTE(review): the opening pattern was stripped to "m//"; "<probe>"
        # is inferred from the closing "</probe>" pattern.
        next unless $line =~ m/<probe>/;

        # Offset of the start of the line that opened this record.
        my $fp     = tell($fh) - length($line);
        my $header = "";

        while (defined($line = <$fh>) && $line !~ m/<\/probe>/) {
            next unless $line =~ m/\Q$species\E/;

            # The third line after the species line carries the chromosome;
            # start and end follow on the next two lines.
            scalar(<$fh>) for 1 .. 2;
            my $chromosome = _read_field($fh, qr/chromosome>(.+)</);
            my $start      = _read_field($fh, qr/start>(.+)</);
            my $end        = _read_field($fh, qr/end>(.+)</);

            # NOTE(review): the code that built the key and stored the offset
            # was lost in extraction.  Callers take the species from the text
            # before the first comma of a header, so the key is assumed to be
            # "species,chromosome,start,end" - confirm against real
            # validation files.
            if (defined $chromosome && defined $start && defined $end) {
                $header = join(",", $species, $chromosome, $start, $end);
            }
        }

        $lookup{$header} = $fp if $header ne "";
    }

    close($fh);
    return %lookup;
}

# Returns the run parameters as a hash with a single key, "chr": the
# chromosome to process, or "0" for the complete genome.
#
# NOTE(review): only the final 'chr => $opts{chr} || "0"' survived; the
# option parsing is assumed to be a "--chr <value>" switch.
sub getParams {
    my %opts;
    GetOptions(\%opts, 'chr=s')
        or die "Usage: $0 [--chr <chromosome>]\n";
    return ( chr => $opts{chr} || "0" );
}

##############################################################################
# Start Description
# For one chromosome, match validation results with full XML probe
# information to produce two output files, one limited to unique probes and
# the other to non-unique probes.
#
#
# End Description
##############################################################################