#!/usr/bin/perl # This program attempts to select a near-minimal set of tagSNPs that capture # all of the ldSelect bins in N populations, N>=1. ############################################################################################# #/-----------------------------------------------------------------------------------------\# #| |# #| Program: multiPopTagSelect |# #| Version: 1.1 |# #| Copyright (C) 2005-2006 |# #| by Bryan N. Howie, Christopher S. Carlson, Mark J. Rieder, and Deborah A. Nickerson |# #| University of Washington |# #| |# #| |# #| All rights reserved. |# #| |# #| You have the permission to use and develop multiPopTagSelect.pl ("the software"), |# #| provided that the following conditions are met: |# #| |# #| 1. The software is not published, distributed, or otherwise transferred or made |# #| available to others. |# #| |# #| 2. If utilization of the software results in outcomes which will be published, please |# #| specify the version of the software you used and cite attributions noted above. |# #| |# #| 3. You acknowledge that University of Washington ("UW") and the UW developers may |# #| develop modifications to the software that may be substantially similar to your |# #| modifications of the software and that UW and UW developers shall not be |# #| constrained in any way by you in UW's and UW developers' use or management of |# #| such modifications. You acknowledge the right of the UW and UW developers to |# #| prepare and publish modifications to the software that may be substantially |# #| similar or functionally equivalent to your modifications and improvements, and if |# #| you obtain patent protection for any modification or improvement to the software, |# #| you agree not to allege or enjoin infringement of your patent by the UW and UW |# #| developers. |# #| |# #| This software is provided "AS IS" and any express or implied warranties, including, |# #| but not limited to, the implied warranties of merchantability and fitness for a |# #| particular purpose are disclaimed. In particular, this disclaimer applies to any |# #| diagnostic purpose. In no event shall the authors or the University of Washington be |# #| liable for any direct, indirect, incidental, special, exemplary, or consequential |# #| damages (including, but not limited to, procurement of substitute goods or services; |# #| loss of use, data, or profits; or business interruption) however caused and on any |# #| theory of liability, whether in contract, strict liability, or tort (including |# #| negligence or otherwise) arising in any way out of the use of this software, even if |# #| advised of the possibility of such damage. |# #| |# #\-----------------------------------------------------------------------------------------/# ############################################################################################# use strict; use warnings; # primary algorithm functions sub ReadContextFile($); sub ReadSiteScoresFile($); sub ReadExcludedSNPsFile($); sub ReadRequiredSNPsFile($); sub FindDistinctTagSNPs($\@\%); sub StoreContextInfo(\$); sub InitializeSNPInfo(\%$$$); sub ChooseMaximallyInformativeSNPs(\@\@\@); sub SortSitesByBinContent(\@); sub AreBinLocationsEquivalent(\@\@); sub ChooseRepresentativeSNP(\@); sub CompileRankedList(\@\@); sub SendSNPToListSegment($); sub SortArrayBySiteScore(\@); sub RandomizeList(\@); sub FindMinimalTagSNPSet(\@\@\@); sub DeepCopy(\@); sub FillCoveredBinsRepository(\@\@); sub IsSNPExpendable(\@$); sub IsElementInArray(\@$); sub ScoreTagSNP($); sub SortSelectedSNPs(\@); sub PrintResults(\@); sub ParseCommandLine(); # functions associated with finding a provably minimal tagSNP set sub AreDiscardedSNPsOptimal(\@); sub IsSNPEverExpendable(\@$); sub DoesBinCoverageOverlap($$); sub AreArraysIdentical(\@\@); sub CreateSNPRemovalTemplates(\@\%\%); sub AssemblePatternArrays(\@); sub ReplicatePatternArrays(\@$); sub AreTemplatesRelevant(\@\@); sub DoResidualSNPsAccountForAllBins(\@\@); ParseCommandLine(); # command line variables my %arg; my $ld_select_files = $arg{-ld_select}; # file containing a list of ldSelect output files my $context = $arg{-context}; # file containing genomic and sequence context info my $scores = $arg{-scores}; # file containing scores for all sites my $excluded = $arg{-excluded}; # file containing a list of SNPs that must not be selected my $required = $arg{-required}; # file containing a list of SNPs that must be selected my $optimal = $arg{-optimal}; # seek a provably optimal solution? (y or n) my $verbose = $arg{-verbose}; # show program progress? (y or n) # global variables my $line; my $num_pops = 0; my (%tag_snp_info, %site_scores, %context_info, %discarded_snp_info); my (%excluded_snps, %required_snps, @required_other_snps); my (@representative_snps, @redundant_snps, @discarded_snps, @ranked_snps); my (@distinct_tag_snps, @minimal_tag_snp_set); my (@RF3_snps, @RI_snps, @RF5_snps, @RU_snps, @RS_snps, @RN_snps, @RT_snps); my (@UF3_snps, @UI_snps, @UF5_snps, @UU_snps, @US_snps, @UN_snps, @UT_snps); my @representative_snp_bin_repository; # file paths and temporary file names my @ldSelect_outputs = (); # ldSelect output files MAIN: { # parse genomic and sequence context file, if provided ReadContextFile($context) if ($context ne ""); # parse site scores file, if provided ReadSiteScoresFile($scores) if ($scores ne ""); # parse file of SNPs that must NOT be selected, if provided ReadExcludedSNPsFile($excluded) if ($excluded ne ""); # parse file of SNPs that must be selected, if provided ReadRequiredSNPsFile($required) if ($required ne ""); # parse ldSelect files and catalogue every distinct tagSNP FindDistinctTagSNPs($ld_select_files, @distinct_tag_snps, %tag_snp_info); # Algorithm Step 1: assign tagSNPs to mutually exclusive clusters and choose a # single "maximally informative" tagSNP to represent each cluster ChooseMaximallyInformativeSNPs(@distinct_tag_snps, @redundant_snps, @representative_snps); # Algorithm Step 2a: assemble maximally informative tagSNPs into a list, either in # random order or ranked by some desirable quality (like genomic sequence context) CompileRankedList(@representative_snps, @ranked_snps); # Algorithm Step 2b: traverse the tagSNP list, removing each SNP in turn. If no bin # in any population loses representation, discard the SNP; otherwise, return it to # the list. FindMinimalTagSNPSet(@ranked_snps, @discarded_snps, @minimal_tag_snp_set); # if the -optimal flag was activated, check to see whether the current # solution is truly minimal; if not, replace it with an optimal solution if ($optimal eq 'Y') { my $initial_solution = scalar(@minimal_tag_snp_set); # check optimality of current solution and replace if necessary if (!AreDiscardedSNPsOptimal(@minimal_tag_snp_set)) { my $improved_solution = scalar(@minimal_tag_snp_set); print STDERR "\nNon-optimal solution found: set reduced from"; print STDERR " $initial_solution SNPs to $improved_solution SNPs.\n\n"; } } # add any required but non-tagging SNPs to the final set push @minimal_tag_snp_set, @required_other_snps; # sort SNPs alphabetically or numerically, as appropriate SortSelectedSNPs(@minimal_tag_snp_set); # print program output PrintResults(@minimal_tag_snp_set); } ################################################################### # FUNCTIONS # ################################################################### # parse genomic and sequence context file sub ReadContextFile($) { my $context_file = shift; open(CONTEXT_FILE, "$context_file") || die $!; while (defined ($line = )) { my ($site, $genomic_context, $sequence_context) = split /\s+/, $line; # assign identifier for genomic context if ($genomic_context =~ /flanking|Flanking/) { if ($genomic_context =~ "5'-") { $context_info{$site}->{location} = '5-prime'; } elsif ($genomic_context =~ "3'-") { $context_info{$site}->{location} = '3-prime'; } $genomic_context = 'F'; } elsif ($genomic_context =~ /utr|UTR/) { if ($genomic_context =~ "5'-") { $context_info{$site}->{location} = '5-prime'; } elsif ($genomic_context =~ "3'-") { $context_info{$site}->{location} = '3-prime'; } $genomic_context = 'U'; } elsif ($genomic_context =~ /intron|Intron/) { $genomic_context = 'I'; } elsif ($genomic_context =~ /nonsyn|Nonsyn/) { $genomic_context = 'N'; } elsif ($genomic_context =~ /^synon|^Synon/) { $genomic_context = 'S'; } elsif ($genomic_context =~ /frame-shift|Frame-Shift/) { $genomic_context = 'T'; } else { $genomic_context = 'X'; } # assign identifier for sequence context if ($sequence_context =~ /repeat|Repeat/) { $sequence_context = 'R'; } elsif ($sequence_context =~ /unique|Unique/) { $sequence_context = 'U'; } else { $sequence_context = 'X'; } $context_info{$site}->{sequence} = $sequence_context; $context_info{$site}->{genomic} = $genomic_context; } close(CONTEXT_FILE); } # parse site scores file sub ReadSiteScoresFile($) { my $scores_file = shift; open(SCORES_FILE, "$scores_file") || die $!; while (defined ($line = )) { my ($site, $score) = split /\s+/, $line; $site_scores{$site} = $score; } close(SCORES_FILE); } # parse file of SNPs that must NOT be selected sub ReadExcludedSNPsFile($) { my $excluded_snps_file = shift; open(EXCLUDED_SNPS, "$excluded_snps_file") || die $!; while (defined ($line = )) { my $snp_id = $line; chomp $snp_id; $excluded_snps{$snp_id} = 1; } close(EXCLUDED_SNPS); } # parse file of SNPs that must be selected sub ReadRequiredSNPsFile($) { my $required_snps_file = shift; open(REQUIRED_SNPS, "$required_snps_file") || die $!; while (defined ($line = )) { my $snp_id = $line; chomp $snp_id; $required_snps{$snp_id} = 1; } close(REQUIRED_SNPS); } # parse ldSelect files and catalogue every distinct tagSNP (UNLESS it has been # designated for exclusion); assign each distinct tagSNP a vector detailing the # bins it tags in each population sub FindDistinctTagSNPs($\@\%) { my ($ldSelect_files, $distinct_tagSNPs, $tagSNP_info) = @_; my %has_snp_been_seen; my %required_tags; # keeps track of which required SNPs are also tagSNPs open(LD_FILE_NAMES, "$ldSelect_files") || die $!; while (defined ($line = )) { chomp $line; push @ldSelect_outputs, $line; $num_pops++; } close(LD_FILE_NAMES); # find every tagSNP that is represented in at least one ldSelect output file for my $i (0..$#ldSelect_outputs) { open(LD_SELECT, "$ldSelect_outputs[$i]") || die $!; my $line_counter = 0; my $current_bin = 0; while (defined ($line = )) { $line_counter++; if ($line_counter == 1) { my @line_info = split /\s+/, $line; $current_bin = $line_info[1]; } elsif ($line_counter == 2) { my @line_info = split /\s+/, $line; my @tag_snps; @tag_snps = @line_info[3..$#line_info] if (scalar(@line_info) >= 4); # check for new tagSNPs foreach my $snp (@tag_snps) { # if the third character is a hyphen, assume that this SNP # was provided with genomic and sequence context information if (length($snp) > 3 && substr($snp, 2, 1) eq '-') { # remove context info from SNP and store in %context_info StoreContextInfo($snp); } next if (exists($excluded_snps{$snp})); # if this SNP has not been observed as a tagSNP before, initialize it if (!exists($has_snp_been_seen{$snp})) { $has_snp_been_seen{$snp} = 1; InitializeSNPInfo(%$tagSNP_info, $snp, $i, $current_bin); push @distinct_tag_snps, $snp; if (exists($required_snps{$snp})) { $required_tags{$snp} = 1; } } else # this SNP has already bin noted as a tagSNP, so just update its info { $$tagSNP_info{$snp}->{bin_locations}->[$i] = $current_bin; $$tagSNP_info{$snp}->{num_bins_tagged}++; } } } elsif ($line_counter == 4) { $line_counter = 0; # reset line counter for the next bin } } close(LD_SELECT); } # store any required SNPs that were not designated as tagSNPs in a special array foreach my $required_snp (keys %required_snps) { if (!exists($required_tags{$required_snp})) { push @required_other_snps, $required_snp; my @bin_locations = (); for my $i (0..$num_pops-1) { push @bin_locations, 0; } $$tagSNP_info{$required_snp}->{bin_locations} = \@bin_locations; } } } # remove context info from SNP and store in %context_info hash sub StoreContextInfo(\$) { my $snp = shift; my $sequence_context = substr($$snp, 0, 1); my $genomic_context = substr($$snp, 1, 1); $$snp = substr($$snp, 3, length($$snp)-3); if (exists($context_info{$$snp})) { # report an error and quit if this context info conflicts with previous info if ($context_info{$$snp}->{sequence} ne $sequence_context || $context_info{$$snp}->{genomic} ne $genomic_context) { print STDERR "\nDiscrepancy between ldSelect file and sequence "; print STDERR "context file for SNP $$snp.\nExiting now.\n\n"; exit; } } else { $context_info{$$snp}->{sequence} = $sequence_context; $context_info{$$snp}->{genomic} = $genomic_context; } } # initialize a hash containing important information about each tagSNP sub InitializeSNPInfo(\%$$$) { my ($tagSNP_info, $snp, $tag_index, $first_bin) = @_; # this array holds the bins that the current SNP tags in each population in # the order the populations are represented in the @ldSelect_outputs array my @bin_locations = (); # initialize @bin_locations array for my $i (0..$tag_index-1) { push @bin_locations, 0; } push @bin_locations, $first_bin; for my $i ($tag_index+1..$num_pops-1) { push @bin_locations, 0; } # assign each tagSNP a sequence context score (default = 0) my $context_score = ScoreTagSNP($snp); # store information about the current tagSNP $$tagSNP_info{$snp} = { context_score => $context_score, bin_locations => \@bin_locations, num_bins_tagged => 1 }; # if a file of site scores was provided, check to make sure that # all observed tagSNPs have scores - if not, assign them scores of zero if (!exists $site_scores{$snp}) { $site_scores{$snp} = 0; } } # Algorithm Step 1: # Assign tagSNP vectors to mutually exclusive clusters such that a) the "maximally # informative" vectors (those with the greatest number of informative entries) in # a cluster are identical and b) the other vectors in a cluster contain no information # not found in the maximally informative vectors. Choose one SNP from each cluster # (either at random or on the basis of some desirable quantity, like genomic # sequence context). These "representative" SNPs are stored in the @representatives # array, which is passed by reference to this function. sub ChooseMaximallyInformativeSNPs(\@\@\@) { my ($snp_array, $redundants, $representatives) = @_; # sort SNPs by the number of bins they tag, with more-informative SNPs nearer # the front of the list SortSitesByBinContent(@$snp_array); print STDERR "Sites sorted by bin content.\n" if ($verbose eq 'Y'); my @remaining_snps = @$snp_array; my $previous_percentage = 0; # each pass through this loop identifies one cluster of tagSNPs and chooses # a maximally informative SNP to represent that cluster while (scalar(@remaining_snps) > 0) { my @temp; my @maximally_informative_snps = ($remaining_snps[0]); my @bin_locations_1 = @{$tag_snp_info{$remaining_snps[0]}->{bin_locations}}; for my $i (1..$#remaining_snps) { my @bin_locations_2 = @{$tag_snp_info{$remaining_snps[$i]}->{bin_locations}}; if (AreBinLocationsEquivalent(@bin_locations_1, @bin_locations_2)) { if ($tag_snp_info{$remaining_snps[0]}->{num_bins_tagged} == $tag_snp_info{$remaining_snps[$i]}->{num_bins_tagged}) { push @maximally_informative_snps, $remaining_snps[$i]; } elsif (exists($required_snps{$remaining_snps[$i]})) { push @$representatives, $remaining_snps[$i]; } else { push @$redundants, $remaining_snps[$i]; } } else { push @temp, $remaining_snps[$i]; } } @remaining_snps = @temp; # choose the best SNP (the "representative") from the current cluster of # maximally informative SNPs my $representative = ChooseRepresentativeSNP(@maximally_informative_snps); push @$representatives, $representative; # now that a representative SNP has been chosen for the current cluster of # maximally informative SNPs, we can eliminate the others foreach my $snp (@maximally_informative_snps) { next if ($snp eq $representative); if (exists($required_snps{$snp})) { push @$representatives, $snp; next; } push @$redundants, $snp; } if ($verbose eq 'Y' && (100 - 100*scalar(@remaining_snps)/scalar(@$snp_array)) > ($previous_percentage + 1)) { print STDERR "First algorithm step "; print STDERR sprintf("%.0f", 100-100*scalar(@remaining_snps)/scalar(@$snp_array)); print STDERR "% complete.\n"; $previous_percentage += 1; } } } # sort SNPs by the number of bins they tag, with more informative SNPs nearer # the front of the list sub SortSitesByBinContent(\@) { my $snp_array = shift; my @tagged_bin_tallies; # parallels @$snp_array my @sorted_snp_array; # determine how many informative entries each SNP's vector contains foreach my $snp (@$snp_array) { push @tagged_bin_tallies, $tag_snp_info{$snp}->{num_bins_tagged}; } # sort SNPs in order of decreasing number of tagged bins my @sorted_bin_tallies = reverse sort {$a <=> $b} @tagged_bin_tallies; my $previous_tally = 0; my $previous_percentage = 0; my $previous_tag_index = -1; for my $i (0..$#sorted_bin_tallies) { if ($verbose eq 'Y' && 100*($i+1)/scalar(@sorted_bin_tallies) > ($previous_percentage + 1)) { print STDERR sprintf("%.0f",100*($i+1)/scalar(@sorted_bin_tallies)),"% of sites sorted.\n"; $previous_percentage += 1; } if ($sorted_bin_tallies[$i] != $previous_tally) { $previous_tag_index = -1; } for my $j ($previous_tag_index+1..$#tagged_bin_tallies) { if ($sorted_bin_tallies[$i] == $tagged_bin_tallies[$j]) { push @sorted_snp_array, $$snp_array[$j]; $previous_tag_index = $j; last; } } $previous_tally = $sorted_bin_tallies[$i]; } @$snp_array = @sorted_snp_array; } # check to see if the bins tagged by one SNP are "equivalent" to the bins tagged # by a second SNP, which must tag the same number or fewer bins; here, # "equivalent" means that there is at least one population in which both SNPs # are tagSNPs and that these SNPs tag the same bins in all such populations sub AreBinLocationsEquivalent(\@\@) { my $bin_locations_1 = shift; my $bin_locations_2 = shift; my $comparison_made = 0; # boolean variable - were there any populations in which the # two tagSNPs in question were both present? for my $i (0..$#$bin_locations_1) { next if ($$bin_locations_2[$i] == 0); $comparison_made = 1; return 0 if ($$bin_locations_1[$i] != $$bin_locations_2[$i]); } # if the two tagSNPs in question are not simultaneously present in any population, they # are not equivalent for the purposes of this analysis return $comparison_made; } # return the tagSNP that has the highest precedence in a given group of maximally # informative SNPs (the input array contains one such group); if required SNPs are # present, automatically choose the first one; if multiple tagSNPs tie, choose the # one that has the best context score; if multiple tagSNPs still tie, choose the # one that has the best site score; break any remaining ties by choosing the first # tagSNP in the list sub ChooseRepresentativeSNP(\@) { my $max_inform_snps = shift; # degenerate case: if there is only one maximally informative SNP, it is # automatically chosen to be the representative return ($$max_inform_snps[0]) if (scalar(@$max_inform_snps) == 1); # if the previous return statement isn't triggered, the maximally informative set # contains more than one SNP; therefore, we need to choose among these SNPs my $best_context_score = 0; my @best_scoring_snps = (); foreach my $snp (@$max_inform_snps) { # if there is a required SNP among the maximally informative SNPs, # it is automatically selected if (exists($required_snps{$snp})) { return $snp; } my $context_score = $tag_snp_info{$snp}->{context_score}; if ($context_score > $best_context_score) { $best_context_score = $context_score; @best_scoring_snps = (); push (@best_scoring_snps, $snp); } elsif ($context_score == $best_context_score) { push (@best_scoring_snps, $snp); } } if ($scores ne "" && scalar(@best_scoring_snps) > 1) { my $best_site_score = 0; my @high_scoring_snps = (); foreach my $snp (@best_scoring_snps) { my $site_score = $site_scores{$snp}; if ($site_score > $best_site_score) { $best_site_score = $site_score; @high_scoring_snps = (); push @high_scoring_snps, $snp; } elsif ($site_score == $best_site_score) { push @high_scoring_snps, $snp; } } return $high_scoring_snps[0]; } else { return $best_scoring_snps[0]; } } # Algorithm Step 2a: # Assemble maximally informative tagSNPs into a list, either in random order # or ranked by some desirable qualities (like genomic and sequence context). sub CompileRankedList(\@\@) { my ($snp_list, $ranked_list) = @_; # direct each tagSNP to the proper sequence context array foreach my $snp (@$snp_list) { SendSNPToListSegment($snp); } # sort each sequence context array in order of increasing site scores SortArrayBySiteScore(@RF3_snps); SortArrayBySiteScore(@RI_snps); SortArrayBySiteScore(@RF5_snps); SortArrayBySiteScore(@RU_snps); SortArrayBySiteScore(@RS_snps); SortArrayBySiteScore(@RN_snps); SortArrayBySiteScore(@RT_snps); SortArrayBySiteScore(@UF3_snps); SortArrayBySiteScore(@UI_snps); SortArrayBySiteScore(@UF5_snps); SortArrayBySiteScore(@UU_snps); SortArrayBySiteScore(@US_snps); SortArrayBySiteScore(@UN_snps); SortArrayBySiteScore(@UT_snps); # consolidate sequence context arrays my @repeat_seq_snps = (@RF3_snps,@RI_snps,@RF5_snps,@RU_snps,@RS_snps,@RN_snps,@RT_snps); my @unique_seq_snps = (@UF3_snps,@UI_snps,@UF5_snps,@UU_snps,@US_snps,@UN_snps,@UT_snps); @$ranked_list = (@repeat_seq_snps, @unique_seq_snps); # This option is mostly for testing purposes; there's no particular # reason to randomize the list order in practice (it will obliterate # the previously established precedence hierarchy). If desired, however, # simply removing the comment below will have that effect. #RandomizeList(@$ranked_list); } # assign a SNP to the proper list segment (a list segment is an array # containing all SNPs with the same sequence and genomic context) sub SendSNPToListSegment($) { my $snp = shift; # default: send SNP to @RF3_snps array if it lacks sequence context info if (!exists($context_info{$snp})) { push @RF3_snps, $snp; return; } # (U)nique or (R)epetitive? my $sequence_context = $context_info{$snp}->{sequence}; # (N)onsynonymous, (S)ynonymous, (U)ntranslated region, (F)lanking, # (I)ntron, (T) frameshift, or (X) other? my $genomic_context = $context_info{$snp}->{genomic}; # assign the SNP to the proper list segment if ($sequence_context eq 'R') { if ($genomic_context eq 'F') { # distinguish between 5'- and 3'-flanking SNPs if (exists($context_info{$snp}->{location}) && $context_info{$snp}->{location} eq '5-prime') { push @RF5_snps, $snp; } else { push @RF3_snps, $snp; } } elsif ($genomic_context eq 'I') { push @RI_snps, $snp; } elsif ($genomic_context eq 'U') { push @RU_snps, $snp; } elsif ($genomic_context eq 'S') { push @RS_snps, $snp; } elsif ($genomic_context eq 'N') { push @RN_snps, $snp; } elsif ($genomic_context eq 'T') { push @RT_snps, $snp; } else { unshift @RF3_snps, $snp; } } elsif ($sequence_context eq 'U') { if ($genomic_context eq 'F') { # distinguish between 5'- and 3'-flanking SNPs if (exists($context_info{$snp}->{location}) && $context_info{$snp}->{location} eq '5-prime') { push @UF5_snps, $snp; } else { push @UF3_snps, $snp; } } elsif ($genomic_context eq 'I') { push @UI_snps, $snp; } elsif ($genomic_context eq 'U') { push @UU_snps, $snp; } elsif ($genomic_context eq 'S') { push @US_snps, $snp; } elsif ($genomic_context eq 'N') { push @UN_snps, $snp; } elsif ($genomic_context eq 'T') { push @UT_snps, $snp; } else { unshift @UF3_snps, $snp; } } else { unshift @RF3_snps, $snp; } } # sort the SNPs in the input array in order of increasing site scores sub SortArrayBySiteScore(\@) { my $snp_array = shift; return if (scalar(@$snp_array) == 0); my @site_score_array; # parallels @$snp_array my @sorted_snp_array; # catalogue site scores for the relevant SNPs foreach my $snp (@$snp_array) { push @site_score_array, $site_scores{$snp}; } # sort SNPs in order of increasing site scores my @sorted_site_scores = sort {$a <=> $b} @site_score_array; my $previous_score = 0; my $previous_tag_index = -1; for my $i (0..$#sorted_site_scores) { if ($sorted_site_scores[$i] != $previous_score) { $previous_tag_index = -1; } for my $j ($previous_tag_index+1..$#site_score_array) { if ($sorted_site_scores[$i] == $site_score_array[$j]) { push @sorted_snp_array, $$snp_array[$j]; $previous_tag_index = $j; last; } } $previous_score = $sorted_site_scores[$i]; } @$snp_array = @sorted_snp_array; } # take an array, shuffle its elements into a new random order, and return the # randomized array sub RandomizeList(\@) { my $original_list = shift; my @temp_list = (); while (@$original_list) { push (@temp_list, splice(@$original_list, rand(@$original_list), 1)); } @$original_list = @temp_list; } # Algorithm Step 2b: # Traverse the tagSNP list, removing each SNP in turn. If no bin in any population # loses representation, discard the SNP; otherwise, return it to the list. sub FindMinimalTagSNPSet(\@\@\@) { my ($ranked_snps, $discarded_representative_snps, $near_minimal_set) = @_; my @ranked_snp_list = @$ranked_snps; my @covered_bins_repository = (); FillCoveredBinsRepository(@covered_bins_repository, @ranked_snp_list); @representative_snp_bin_repository = DeepCopy(@covered_bins_repository) if ($optimal eq 'Y'); my $list_posn = 0; while ($list_posn <= $#ranked_snp_list) { # discard the current SNP if the bins it covers can be covered by other SNPs in the list if (IsSNPExpendable(@covered_bins_repository, $ranked_snp_list[$list_posn])) { push @$discarded_representative_snps, $ranked_snp_list[$list_posn]; @ranked_snp_list = (@ranked_snp_list[0..$list_posn-1], @ranked_snp_list[$list_posn+1..$#ranked_snp_list]); } else # leave the @ranked_snps list intact { $list_posn++; } } print STDERR "Second algorithm step complete.\n" if ($verbose eq 'Y'); @$near_minimal_set = @ranked_snp_list; } # fill a repository (first input array) with a list of arrays containing the # bins captured in each population by a list of SNPs (second input array) sub FillCoveredBinsRepository(\@\@) { my ($repository, $snp_array) = @_; for my $i (0..$num_pops-1) { my @bins_covered = (); foreach my $snp (@$snp_array) { my $bin = $tag_snp_info{$snp}->{bin_locations}->[$i]; next if ($bin == 0); if (scalar(@bins_covered) < $bin) { for my $j (scalar(@bins_covered)..$bin-1) { $bins_covered[$j] = 0; } } $bins_covered[$bin-1]++; } push (@$repository, \@bins_covered); } } # the input array should contain a list of array references sub DeepCopy(\@) { my $array_to_copy = shift; my @return_array; foreach my $array_entry (@$array_to_copy) { my @array_entry_copy = @$array_entry; push @return_array, \@array_entry_copy; } return @return_array; } # can this SNP be discarded without causing any bin to lose representation? sub IsSNPExpendable(\@$) { my ($repository, $snp) = @_; return 0 if (exists($required_snps{$snp})); # required SNPs are never expendable my @bins_lost = @{$tag_snp_info{$snp}->{bin_locations}}; for my $i (0..$#bins_lost) { next if ($bins_lost[$i] == 0); return 0 if ($$repository[$i]->[$bins_lost[$i]-1] == 1); } # if this part of the function is reached, the queried SNP is expendable; # this being the case, we need to remove its entries from the covered bins repository for my $i (0..$#bins_lost) { next if ($bins_lost[$i] == 0); $$repository[$i]->[$bins_lost[$i]-1]--; } return 1; } # is the input element present in the input array? sub IsElementInArray(\@$) { my $array_ref = shift; my $array_entry = shift; foreach my $element (@$array_ref) { if ($element eq $array_entry) { return 1; } } return 0; } # defaults to zero if the tagSNP has no sequence context info; note that # the magnitude of a tagSNP's score does not affect the algorithm's decisions # since it is concerned with RELATIVE scores only sub ScoreTagSNP($) { my $tag_snp = shift; my $score = 0; my $first_letter = ''; my $second_letter = ''; return $score if (!exists($context_info{$tag_snp})); # U(nique) or R(epetitive)? my $sequence_context = $context_info{$tag_snp}->{sequence}; # N(onsynonymous), S(ynonymous), U(ntranslated region), F(lanking), # I(ntron), or frameshif(T)? my $genomic_context = $context_info{$tag_snp}->{genomic}; # all SNPs in unique sequence receive higher scores than SNPs in # repetitive sequence if ($sequence_context eq 'U') { $score += 7; } if ($genomic_context eq 'F') { if (exists($context_info{$tag_snp}->{location}) && $context_info{$tag_snp}->{location} eq '5-prime') { $score += 3; } else { $score += 1; } } elsif ($genomic_context eq 'I') { $score += 2; } elsif ($genomic_context eq 'U') { $score += 4; } elsif ($genomic_context eq 'S') { $score += 5; } elsif ($genomic_context eq 'N') { $score += 6; } elsif ($genomic_context eq 'T') { $score += 7; } return $score; } # sort SNPs and attach context IDs (if present) and bin info sub SortSelectedSNPs(\@) { my $selected_snps = shift; my @sorted_snps; # sort SNPs (either numerically or alphabetically, as appropriate) if (scalar(@$selected_snps) > 0) { if ($$selected_snps[0] =~ /\D/) { @sorted_snps = sort @$selected_snps; } else { @sorted_snps = sort {$a <=> $b} @$selected_snps; } } foreach my $snp (@sorted_snps) { my @bin_locations = @{$tag_snp_info{$snp}->{bin_locations}}; # append context info onto each SNP (if present) if (exists($context_info{$snp})) { my $context_id = $context_info{$snp}->{sequence}; $context_id .= $context_info{$snp}->{genomic}; $snp = $context_id.'-'.$snp; } # append bin information onto each SNP $snp .= "-"; for my $i (0..$#bin_locations) { $snp .= "$bin_locations[$i]"; $snp .= "/" unless ($i == $#bin_locations); } } @$selected_snps = @sorted_snps; } # print program output sub PrintResults(\@) { my $final_snp_set = shift; print "num_tagSNPs: ", scalar(@$final_snp_set), "\n"; print "Selected_tagSNPs: @$final_snp_set\n\n"; } # parse the command line sub ParseCommandLine() { my $usage = "\nmultiPopTagSelect.pl\n"; $usage .= " [-ld_select] file containing a list of ldSelect output file names\n"; $usage .= " <-context> file containing genomic and sequence context info\n"; $usage .= " <-scores> file containing SNP design scores\n"; $usage .= " <-excluded> file containing a list of SNPs that must not be selected\n"; $usage .= " <-required> file containing a list of SNPs that must be selected\n"; $usage .= " <-optimal> seek a provably optimal solution? (y or n; default = n)\n"; $usage .= " <-verbose> display program progress? (y or n; default = n)\n\n"; # default values $arg{-ld_select} = ""; $arg{-context} = ""; $arg{-scores} = ""; $arg{-excluded} = ""; $arg{-required} = ""; $arg{-optimal} = 'N'; $arg{-verbose} = 'N'; # parse the command line for (my $i = 0; $i <= $#ARGV; $i++) { if ($ARGV[$i] =~ /^-/) { $arg{$ARGV[$i]} = $ARGV[$i+1]; } } $arg{-optimal} = uc($arg{-optimal}); $arg{-verbose} = uc($arg{-verbose}); die ($usage) if (!$arg{-ld_select}); die ($usage) if ($arg{-optimal} ne 'N' && $arg{-optimal} ne 'Y'); die ($usage) if ($arg{-verbose} ne 'N' && $arg{-verbose} ne 'Y'); } ################################################################### # OPTIMAL SNP SELECTION FUNCTIONS # ################################################################### # check to see whether the current solution is truly minimal; if not, # replace it with an optimal solution sub AreDiscardedSNPsOptimal(\@) { my $best_current_solution = shift; my $are_discarded_snps_optimal = 1; # boolean return variable my @discarded_snp_clusters; my %distinct_overlapping_snps; my (@redundant_selected_snps, @unique_selected_snps); # looking back to the algorithm's second step, determine which selected SNPs # uniquely tagged a bin (note that these SNPs are a superset of "mandatory" SNPs) # and which SNPs could have been discarded under at least one possible list order; # refer to the first group as "unique" SNPs and the second as "redundant" SNPs foreach my $selected_snp (@$best_current_solution) { if (IsSNPEverExpendable(@representative_snp_bin_repository, $selected_snp)) { push @redundant_selected_snps, $selected_snp; } else { push @unique_selected_snps, $selected_snp; } } return 1 if (scalar(@redundant_selected_snps) == 0); # identify (redundant) selected SNPs that tag some of the same bins as SNPs that # were discarded in the second step of the algorithm; refer to these selected SNPs # (and the corresponding discarded SNPs) as "overlapping" since their bin coverage # vectors "overlap" in the sense of sharing at least one (non-zero) element # this hash keeps track of which discarded SNPs are "singly connected"; each such # SNP overlaps with only one redundant selected SNP my %is_snp_singly_connected; foreach my $discarded_snp (@discarded_snps) { my (@overlapping_selected_snps, @other_snps); @other_snps = @unique_selected_snps; foreach my $selected_snp (@redundant_selected_snps) { if (DoesBinCoverageOverlap($selected_snp, $discarded_snp)) { push @overlapping_selected_snps, $selected_snp; } else { push @other_snps, $selected_snp; } } if (scalar(@overlapping_selected_snps) == 1) { $is_snp_singly_connected{$discarded_snp} = 1; } else { $discarded_snp_info{$discarded_snp}->{overlap} = \@overlapping_selected_snps; $discarded_snp_info{$discarded_snp}->{other} = \@other_snps; } } # cluster discarded SNPs based on shared overlapping selected SNPs my %is_snp_clustered; for my $i (0..$#discarded_snps) { next if ($is_snp_singly_connected{$discarded_snps[$i]}); next if ($is_snp_clustered{$discarded_snps[$i]}); my @current_overlaps = @{$discarded_snp_info{$discarded_snps[$i]}->{overlap}}; if (scalar(@current_overlaps) == 0) { $is_snp_clustered{$discarded_snps[$i]} = 1; next; } my @current_cluster = ($discarded_snps[$i]); $is_snp_clustered{$discarded_snps[$i]} = 1; my $sentinel = $discarded_snps[$i]; my $list_posn = $i; do { if (!exists($is_snp_clustered{$discarded_snps[$list_posn]}) && !exists($is_snp_singly_connected{$discarded_snps[$list_posn]})) { my @overlaps = @{$discarded_snp_info{$discarded_snps[$list_posn]}->{overlap}}; foreach my $overlap_snp (@overlaps) { if (IsElementInArray(@current_overlaps, $overlap_snp)) { push @current_cluster, $discarded_snps[$list_posn]; $is_snp_clustered{$discarded_snps[$list_posn]} = 1; $sentinel = $discarded_snps[$list_posn]; foreach my $overlap (@overlaps) { push @current_overlaps, $overlap if (!IsElementInArray(@current_overlaps, $overlap)); } last; } } } $list_posn++; $list_posn = 0 if ($list_posn > $#discarded_snps); } while ($discarded_snps[$list_posn] ne $sentinel); push @discarded_snp_clusters, \@current_cluster; $distinct_overlapping_snps{$discarded_snps[$i]} = \@current_overlaps; # --------------------------------------------------------------------------------- # For debugging: # --------------------------------------------------------------------------------- # print STDERR "|@current_cluster| - |@current_overlaps|\n"; # # foreach my $discard (@current_cluster) # { # print STDERR "|$discard| (@{$tag_snp_info{$discard}->{bin_locations}}):\n"; # # foreach my $overlap (@current_overlaps) # { # print STDERR " |$overlap| (@{$tag_snp_info{$overlap}->{bin_locations}})\n"; # } # # print STDERR "\n"; # } # --------------------------------------------------------------------------------- } my %snp_removal_templates; CreateSNPRemovalTemplates(@discarded_snp_clusters, %distinct_overlapping_snps, %snp_removal_templates); my %was_selected_snp_replaced; # keeps track of selected SNPs replaced by discarded SNPs my @repatriated_discards; CLUSTER: foreach my $discarded_snp_cluster (@discarded_snp_clusters) { my @distinct_overlapping_snps = @{$distinct_overlapping_snps{$$discarded_snp_cluster[0]}}; my @distinct_other_snps; my @discard_removal_templates = @{$snp_removal_templates{scalar(@$discarded_snp_cluster)}}; my @overlap_removal_templates = @{$snp_removal_templates{scalar(@distinct_overlapping_snps)}}; # define @distinct_other_snps as the intersection of the "other SNPs" for all discarded # SNPs in the current cluster; we will not try to replace the @distinct_other_SNPs this # time through the CLUSTER loop foreach my $other_snp (@{$discarded_snp_info{$$discarded_snp_cluster[0]}->{other}}) { next if ($was_selected_snp_replaced{$other_snp}); my $is_always_other = 1; # boolean variable - is this SNP an "other SNP" for # every discarded SNP in this cluster? for my $i (1..$#$discarded_snp_cluster) { my @overlap_snps = @{$discarded_snp_info{$$discarded_snp_cluster[$i]}->{overlap}}; if (IsElementInArray(@overlap_snps, $other_snp)) { $is_always_other = 0; last; } } push @distinct_other_snps, $other_snp if ($is_always_other); } my @smallest_snp_sets; my @replaced_selected_snps = (); my @repatriated_discarded_snps = (); my $was_substitution_made = 0; # boolean variable my @current_snp_set = @$best_current_solution; push @smallest_snp_sets, \@current_snp_set; foreach my $discard_template (@discard_removal_templates) { foreach my $overlap_template (@overlap_removal_templates) { next if (!AreTemplatesRelevant(@$discard_template, @$overlap_template)); # @residual_snps are SNPs that remain after purging the SNPs dictated by the # template instructions; i.e., they include all @distinct_other_snps and subsets # of the discarded SNPs and overlapping selected SNPs in the current cluster my (@residual_snps, @purged_snps, @replacement_snps, @replaced_snps); # execute the template instructions for the discarded SNPs in the current # cluster; here, @replacement_snps are those that we will attempt to # substitute for redundant selected SNPs for my $i (0..$#$discard_template) { if ($$discard_template[$i] eq '1') { push @residual_snps, $$discarded_snp_cluster[$i]; push @replacement_snps, $$discarded_snp_cluster[$i]; } } # execute the template instructions for the overlapping selected SNPs # corresponding to the current cluster; here, @replaced_snps are those # that will be deemed unnecessary if the proposed substition leads to a # more parsimonious solution for my $i (0..$#$overlap_template) { if ($$overlap_template[$i] eq '1') { push @residual_snps, $distinct_overlapping_snps[$i]; } else { push @replaced_snps, $distinct_overlapping_snps[$i]; } } push @residual_snps, @distinct_other_snps; push @residual_snps, @repatriated_discards; # if the @residual_snps capture all of the bins lost through eliminating # the @replaced_snps, they represent a valid alternative to the original # selected SNP set if (DoResidualSNPsAccountForAllBins(@residual_snps, @replaced_snps)) { # print STDERR "Replacement: |@replacement_snps|\n"; # print STDERR "Replaced: |@replaced_snps|\n"; # if the new SNP set is smaller than any existing ones, store it as # the uniquely smallest set thus far; if it is the same size as # existing sets, store it as an equivalent alternative if (scalar(@residual_snps) < scalar(@{$smallest_snp_sets[0]})) { $are_discarded_snps_optimal = 0; $was_substitution_made = 1; @smallest_snp_sets = (\@residual_snps); @replaced_selected_snps = (\@replaced_snps); @repatriated_discarded_snps = (\@replacement_snps); } #---------------------------------------------------------------------------------- # NOTE: this conditional clause might prove useful later if we want to # choose between minimal sets of the same size based on additional criteria # elsif (scalar(@residual_snps) == scalar(@{$smallest_snp_sets[0]})) # { # push @smallest_snp_sets, \@residual_snps; # push @replaced_selected_snps, \@replaced_snps; # } #---------------------------------------------------------------------------------- } } } if ($was_substitution_made) { @$best_current_solution = @{$smallest_snp_sets[0]}; foreach my $replaced_snp (@{$replaced_selected_snps[0]}) { $was_selected_snp_replaced{$replaced_snp} = 1; } push @repatriated_discards, @{$repatriated_discarded_snps[0]}; } } return $are_discarded_snps_optimal; } # could this SNP be discarded from any possible list? sub IsSNPEverExpendable(\@$) { my ($repository, $snp) = @_; return 0 if (exists($required_snps{$snp})); # required SNPs are never discardable my @bins_lost = @{$tag_snp_info{$snp}->{bin_locations}}; for my $i (0..$#bins_lost) { next if ($bins_lost[$i] == 0); return 0 if ($$repository[$i]->[$bins_lost[$i]-1] == 1); } # if this part of the function is reached, the queried SNP is expendable # in at least one list order return 1; } # do these two SNPs tag any of the same bins? sub DoesBinCoverageOverlap($$) { my ($snp_1, $snp_2) = @_; my @snp_1_bins = @{$tag_snp_info{$snp_1}->{bin_locations}}; my @snp_2_bins = @{$tag_snp_info{$snp_2}->{bin_locations}}; for my $i (0..$#snp_1_bins) { if ($snp_1_bins[$i] != 0 && $snp_1_bins[$i] == $snp_2_bins[$i]) { return 1; } } return 0; } # sub AreArraysIdentical(\@\@) { my ($array_1, $array_2) = @_; return 0 if (scalar(@$array_1) != scalar(@$array_2)); for my $i (0..$#$array_1) { return 0 if ($$array_1[$i] ne $$array_2[$i]); } return 1; } # make a set of template arrays that dictate relevant SNP removal # patterns (a pattern is not relevant if we can determine, a priori, # that it will not improve our situation) sub CreateSNPRemovalTemplates(\@\%\%) { my ($discard_clusters, $distinct_overlaps, $removal_templates) = @_; my @relevant_templates; # create SNP removal templates for discarded SNP clusters foreach my $discard_cluster (@$discard_clusters) { next if (exists $$removal_templates{scalar(@$discard_cluster)}); my @composite_array; for my $i (1..scalar(@$discard_cluster)) { my @binary_array = (0, 1); push @composite_array, \@binary_array; } my @removal_templates = AssemblePatternArrays(@composite_array); $$removal_templates{scalar(@$discard_cluster)} = \@removal_templates; } # create SNP removal templates for the union of selected SNPs that overlap # each discarded SNP cluster foreach my $discard_cluster (@$discard_clusters) { my @distinct_overlapping_snps = @{$$distinct_overlaps{$$discard_cluster[0]}}; next if (exists $$removal_templates{scalar(@distinct_overlapping_snps)}); my @composite_array; for my $i (1..scalar(@distinct_overlapping_snps)) { my @binary_array = (0, 1); push @composite_array, \@binary_array; } my @removal_templates = AssemblePatternArrays(@composite_array); $$removal_templates{scalar(@distinct_overlapping_snps)} = \@removal_templates; } } # sub AssemblePatternArrays(\@) { my $composite_array = shift; my @pattern_arrays; my @first_array = @{$$composite_array[0]}; for my $i (0..$#first_array) { $pattern_arrays[$i][0] = $first_array[$i]; } for my $i (1..$#$composite_array) { my @current_array = @{$$composite_array[$i]}; my $num_elements = scalar(@current_array); ReplicatePatternArrays(@pattern_arrays, $num_elements-1); my $index = 0; foreach my $element (@current_array) { for my $j (1..scalar(@pattern_arrays)/$num_elements) { $pattern_arrays[$index][$i] = $element; $index++; } } } # -------------------------------------------------------- # For debugging: # -------------------------------------------------------- # for my $i (0..$#pattern_arrays) # { # for my $j (0..$#{$pattern_arrays[$i]}) # { # print STDERR "$pattern_arrays[$i][$j] "; # } # # print STDERR "\n"; # } # -------------------------------------------------------- return @pattern_arrays; } # sub ReplicatePatternArrays(\@$) { my ($pattern_arrays, $num_reps) = @_; my @base_array = @$pattern_arrays; my $next_index = $#$pattern_arrays + 1; for my $i (1..$num_reps) { for my $j (0..$#base_array) { @{$$pattern_arrays[$next_index]} = @{$base_array[$j]}; $next_index++; } } } # to improve the solution, N discarded SNPs must replace at least # N+1 selected SNPs sub AreTemplatesRelevant(\@\@) { my ($discard_template, $overlap_template) = @_; my $num_discards_retained = 0; my $num_overlaps_eliminated = 0; foreach my $element (@$discard_template) { $num_discards_retained++ if ($element eq '1'); } foreach my $element (@$overlap_template) { $num_overlaps_eliminated++ if ($element eq '0'); } if ($num_discards_retained > 0 && $num_discards_retained < $num_overlaps_eliminated) { return 1; } else { return 0; } } # sub DoResidualSNPsAccountForAllBins(\@\@) { my ($residual_snps, $purged_snps) = @_; my (%residual_bin_indicators, %purged_bin_indicators); # assign %residual_bin_indicators foreach my $residual_snp (@$residual_snps) { my @bins = @{$tag_snp_info{$residual_snp}->{bin_locations}}; for my $i (0..$num_pops-1) { $residual_bin_indicators{$i}->{$bins[$i]} = 1; } } # assign %purged_bin_indicators foreach my $purged_snp (@$purged_snps) { my @bins = @{$tag_snp_info{$purged_snp}->{bin_locations}}; for my $i (0..$num_pops-1) { $purged_bin_indicators{$i}->{$bins[$i]} = 1; } } # determine whether all bins that were purged are still represented # by the residual SNPs for my $i (0..$num_pops-1) { foreach my $purged_bin (keys %{$purged_bin_indicators{$i}}) { if (!exists($residual_bin_indicators{$i}->{$purged_bin})) { return 0; } } } return 1; }