#!/usr/bin/perl
# This program attempts to select a near-minimal set of tagSNPs that capture
# all of the ldSelect bins in N populations, N>=1.

#############################################################################################
#/-----------------------------------------------------------------------------------------\#
#|                                                                                         |#
#|  Program: multiPopTagSelect                                                             |#
#|  Version: 1.1                                                                           |#
#|  Copyright (C) 2005-2006                                                                |#
#|  by Bryan N. Howie, Christopher S. Carlson, Mark J. Rieder, and Deborah A. Nickerson    |#
#|  University of Washington                                                               |#
#|                                                                                         |#
#|                                                                                         |#
#|  All rights reserved.                                                                   |#
#|                                                                                         |#
#|  You have the permission to use and develop multiPopTagSelect.pl ("the software"),      |#
#|  provided that the following conditions are met:                                        |#
#|                                                                                         |#
#|  1. The software is not published, distributed, or otherwise transferred or made        |#
#|     available to others.                                                                |#
#|                                                                                         |#
#|  2. If utilization of the software results in outcomes which will be published, please  |#
#|     specify the version of the software you used and cite attributions noted above.     |#
#|                                                                                         |#
#|  3. You acknowledge that University of Washington ("UW") and the UW developers may      |#
#|     develop modifications to the software that may be substantially similar to your     |#
#|     modifications of the software and that UW and UW developers shall not be            |#
#|     constrained in any way by you in UW's and UW developers' use or management of       |#
#|     such modifications. You acknowledge the right of the UW and UW developers to        |#
#|     prepare and publish modifications to the software that may be substantially         |#
#|     similar or functionally equivalent to your modifications and improvements, and if   |#
#|     you obtain patent protection for any modification or improvement to the software,   |#
#|     you agree not to allege or enjoin infringement of your patent by the UW and UW      |#
#|     developers.                                                                         |#
#|                                                                                         |#
#|  This software is provided "AS IS" and any express or implied warranties, including,    |#
#|  but not limited to, the implied warranties of merchantability and fitness for a        |#
#|  particular purpose are disclaimed. In particular, this disclaimer applies to any       |#
#|  diagnostic purpose. In no event shall the authors or the University of Washington be   |#
#|  liable for any direct, indirect, incidental, special, exemplary, or consequential      |#
#|  damages (including, but not limited to, procurement of substitute goods or services;   |#
#|  loss of use, data, or profits; or business interruption) however caused and on any     |#
#|  theory of liability, whether in contract, strict liability, or tort (including         |#
#|  negligence or otherwise) arising in any way out of the use of this software, even if   |#
#|  advised of the possibility of such damage.                                             |#
#|                                                                                         |#
#\-----------------------------------------------------------------------------------------/#
#############################################################################################

use strict;
use warnings;

# primary algorithm functions
sub ReadContextFile($);
sub ReadSiteScoresFile($);
sub ReadExcludedSNPsFile($);
sub ReadRequiredSNPsFile($);
sub FindDistinctTagSNPs($\@\%);
sub StoreContextInfo(\$);
sub InitializeSNPInfo(\%$$$);
sub ChooseMaximallyInformativeSNPs(\@\@\@);
sub SortSitesByBinContent(\@);
sub AreBinLocationsEquivalent(\@\@);
sub ChooseRepresentativeSNP(\@);
sub CompileRankedList(\@\@);
sub SendSNPToListSegment($);
sub SortArrayBySiteScore(\@);
sub RandomizeList(\@);
sub FindMinimalTagSNPSet(\@\@\@);
sub DeepCopy(\@);
sub FillCoveredBinsRepository(\@\@);
sub IsSNPExpendable(\@$);
sub IsElementInArray(\@$);
sub ScoreTagSNP($);
sub SortSelectedSNPs(\@);
sub PrintResults(\@);
sub ParseCommandLine();

# functions associated with finding a provably minimal tagSNP set
sub AreDiscardedSNPsOptimal(\@);
sub IsSNPEverExpendable(\@$);
sub DoesBinCoverageOverlap($$);
sub AreArraysIdentical(\@\@);
sub CreateSNPRemovalTemplates(\@\%\%);
sub AssemblePatternArrays(\@);
sub ReplicatePatternArrays(\@$);
sub AreTemplatesRelevant(\@\@);
sub DoResidualSNPsAccountForAllBins(\@\@);


ParseCommandLine();

# command line variables
my %arg;
my $ld_select_files = $arg{-ld_select};     # file containing a list of ldSelect output files
my $context = $arg{-context};               # file containing genomic and sequence context info
my $scores = $arg{-scores};                 # file containing scores for all sites
my $excluded = $arg{-excluded};             # file containing a list of SNPs that must not be selected
my $required = $arg{-required};             # file containing a list of SNPs that must be selected
my $optimal = $arg{-optimal};               # seek a provably optimal solution? (y or n)
my $verbose = $arg{-verbose};               # show program progress? (y or n)

# global variables
my $line;
my $num_pops = 0;
my (%tag_snp_info, %site_scores, %context_info, %discarded_snp_info);
my (%excluded_snps, %required_snps, @required_other_snps);
my (@representative_snps, @redundant_snps, @discarded_snps, @ranked_snps);
my (@distinct_tag_snps, @minimal_tag_snp_set);
my (@RF3_snps, @RI_snps, @RF5_snps, @RU_snps, @RS_snps, @RN_snps, @RT_snps);
my (@UF3_snps, @UI_snps, @UF5_snps, @UU_snps, @US_snps, @UN_snps, @UT_snps);
my @representative_snp_bin_repository;

# file paths and temporary file names
my @ldSelect_outputs = ();                          # ldSelect output files


MAIN: {

    # parse genomic and sequence context file, if provided
    ReadContextFile($context) if ($context ne "");

    # parse site scores file, if provided
    ReadSiteScoresFile($scores) if ($scores ne "");

    # parse file of SNPs that must NOT be selected, if provided
    ReadExcludedSNPsFile($excluded) if ($excluded ne "");

    # parse file of SNPs that must be selected, if provided
    ReadRequiredSNPsFile($required) if ($required ne "");

    # parse ldSelect files and catalogue every distinct tagSNP
    FindDistinctTagSNPs($ld_select_files, @distinct_tag_snps, %tag_snp_info);

    # Algorithm Step 1: assign tagSNPs to mutually exclusive clusters and choose a
    # single "maximally informative" tagSNP to represent each cluster
    ChooseMaximallyInformativeSNPs(@distinct_tag_snps, @redundant_snps, @representative_snps);

    # Algorithm Step 2a: assemble maximally informative tagSNPs into a list, either in
    # random order or ranked by some desirable quality (like genomic sequence context)
    CompileRankedList(@representative_snps, @ranked_snps);

    # Algorithm Step 2b: traverse the tagSNP list, removing each SNP in turn. If no bin
    # in any population loses representation, discard the SNP; otherwise, return it to
    # the list.
    FindMinimalTagSNPSet(@ranked_snps, @discarded_snps, @minimal_tag_snp_set);

    # if the -optimal flag was activated, check to see whether the current
    # solution is truly minimal; if not, replace it with an optimal solution
    if ($optimal eq 'Y')
    {
	my $initial_solution = scalar(@minimal_tag_snp_set);

	# check optimality of current solution and replace if necessary
	if (!AreDiscardedSNPsOptimal(@minimal_tag_snp_set))
	{
	    my $improved_solution = scalar(@minimal_tag_snp_set);

	    print STDERR "\nNon-optimal solution found: set reduced from";
	    print STDERR " $initial_solution SNPs to $improved_solution SNPs.\n\n";
	}
    }

    # add any required but non-tagging SNPs to the final set
    push @minimal_tag_snp_set, @required_other_snps;

    # sort SNPs alphabetically or numerically, as appropriate
    SortSelectedSNPs(@minimal_tag_snp_set);

    # print program output
    PrintResults(@minimal_tag_snp_set);

}



###################################################################
#                            FUNCTIONS                            #
###################################################################

# parse genomic and sequence context file
sub ReadContextFile($)
{
    my $context_file = shift;

    open(CONTEXT_FILE, "$context_file") || die $!;

    while (defined ($line = <CONTEXT_FILE>))
    {
	my ($site, $genomic_context, $sequence_context) = split /\s+/, $line;

	# assign identifier for genomic context
	if ($genomic_context =~ /flanking|Flanking/)
	{
	    if ($genomic_context =~ "5'-")
	    {
		$context_info{$site}->{location} = '5-prime';
	    }
	    elsif ($genomic_context =~ "3'-")
	    {
		$context_info{$site}->{location} = '3-prime';
	    }

	    $genomic_context = 'F';
	}
	elsif ($genomic_context =~ /utr|UTR/)
	{
	    if ($genomic_context =~ "5'-")
	    {
		$context_info{$site}->{location} = '5-prime';
	    }
	    elsif ($genomic_context =~ "3'-")
	    {
		$context_info{$site}->{location} = '3-prime';
	    }

	    $genomic_context = 'U';
	}
	elsif ($genomic_context =~ /intron|Intron/)
	{
	    $genomic_context = 'I';
	}
	elsif ($genomic_context =~ /nonsyn|Nonsyn/)
	{
	    $genomic_context = 'N';
	}
	elsif ($genomic_context =~ /^synon|^Synon/)
	{
	    $genomic_context = 'S';
	}
	elsif ($genomic_context =~ /frame-shift|Frame-Shift/)
	{
	    $genomic_context = 'T';
	}
	else
	{
	    $genomic_context = 'X';
	}

	# assign identifier for sequence context
	if ($sequence_context =~ /repeat|Repeat/)
	{
	    $sequence_context = 'R';
	}
	elsif ($sequence_context =~ /unique|Unique/)
	{
	    $sequence_context = 'U';
	}
	else
	{
	    $sequence_context = 'X';
	}

	$context_info{$site}->{sequence} = $sequence_context;
	$context_info{$site}->{genomic} = $genomic_context;
    }

    close(CONTEXT_FILE);
}


# parse site scores file
sub ReadSiteScoresFile($)
{
    my $scores_file = shift;

    open(SCORES_FILE, "$scores_file") || die $!;

    while (defined ($line = <SCORES_FILE>))
    {
	my ($site, $score) = split /\s+/, $line;
	$site_scores{$site} = $score;
    }

    close(SCORES_FILE);
}


# parse file of SNPs that must NOT be selected
sub ReadExcludedSNPsFile($)
{
    my $excluded_snps_file = shift;

    open(EXCLUDED_SNPS, "$excluded_snps_file") || die $!;

    while (defined ($line = <EXCLUDED_SNPS>))
    {
	my $snp_id = $line;
	chomp $snp_id;

	$excluded_snps{$snp_id} = 1;
    }

    close(EXCLUDED_SNPS);
}


# parse file of SNPs that must be selected
sub ReadRequiredSNPsFile($)
{
    my $required_snps_file = shift;

    open(REQUIRED_SNPS, "$required_snps_file") || die $!;

    while (defined ($line = <REQUIRED_SNPS>))
    {
	my $snp_id = $line;
	chomp $snp_id;

	$required_snps{$snp_id} = 1;
    }

    close(REQUIRED_SNPS);
}


# parse ldSelect files and catalogue every distinct tagSNP (UNLESS it has been
# designated for exclusion); assign each distinct tagSNP a vector detailing the
# bins it tags in each population
sub FindDistinctTagSNPs($\@\%)
{
    my ($ldSelect_files, $distinct_tagSNPs, $tagSNP_info) = @_;
    my %has_snp_been_seen;
    my %required_tags;      # keeps track of which required SNPs are also tagSNPs

    open(LD_FILE_NAMES, "$ldSelect_files") || die $!;

    while (defined ($line = <LD_FILE_NAMES>))
    {
	chomp $line;
	push @ldSelect_outputs, $line;
	$num_pops++;
    }

    close(LD_FILE_NAMES);

    # find every tagSNP that is represented in at least one ldSelect output file
    for my $i (0..$#ldSelect_outputs)
    {
	open(LD_SELECT, "$ldSelect_outputs[$i]") || die $!;

	my $line_counter = 0;
	my $current_bin = 0;

	while (defined ($line = <LD_SELECT>))
	{
	    $line_counter++;

	    if ($line_counter == 1)
	    {
		my @line_info = split /\s+/, $line;
		$current_bin = $line_info[1];
	    }
	    elsif ($line_counter == 2)
	    {
		my @line_info = split /\s+/, $line;
		my @tag_snps;
		@tag_snps = @line_info[3..$#line_info] if (scalar(@line_info) >= 4);

		# check for new tagSNPs
		foreach my $snp (@tag_snps)
		{
		    # if the third character is a hyphen, assume that this SNP
		    # was provided with genomic and sequence context information
		    if (length($snp) > 3 && substr($snp, 2, 1) eq '-')
		    {
			# remove context info from SNP and store in %context_info
			StoreContextInfo($snp);
		    }

		    next if (exists($excluded_snps{$snp}));

		    # if this SNP has not been observed as a tagSNP before, initialize it
		    if (!exists($has_snp_been_seen{$snp}))
		    {
			$has_snp_been_seen{$snp} = 1;
			InitializeSNPInfo(%$tagSNP_info, $snp, $i, $current_bin);
			push @distinct_tag_snps, $snp;

			if (exists($required_snps{$snp}))
			{
			    $required_tags{$snp} = 1;
			}
		    }
		    else   # this SNP has already bin noted as a tagSNP, so just update its info
		    {
			$$tagSNP_info{$snp}->{bin_locations}->[$i] = $current_bin;
			$$tagSNP_info{$snp}->{num_bins_tagged}++;
		    }
		}
	    }
	    elsif ($line_counter == 4)
	    {
		$line_counter = 0;        # reset line counter for the next bin
	    }
	}

	close(LD_SELECT);
    }

    # store any required SNPs that were not designated as tagSNPs in a special array
    foreach my $required_snp (keys %required_snps)
    {
	if (!exists($required_tags{$required_snp}))
	{
	    push @required_other_snps, $required_snp;

	    my @bin_locations = ();

	    for my $i (0..$num_pops-1)
	    {
		push @bin_locations, 0;
	    }

	    $$tagSNP_info{$required_snp}->{bin_locations} = \@bin_locations;
	}
    }
}


# remove context info from SNP and store in %context_info hash
sub StoreContextInfo(\$)
{
    my $snp = shift;
    my $sequence_context = substr($$snp, 0, 1);
    my $genomic_context = substr($$snp, 1, 1);
    $$snp = substr($$snp, 3, length($$snp)-3);

    if (exists($context_info{$$snp}))
    {
	# report an error and quit if this context info conflicts with previous info
	if ($context_info{$$snp}->{sequence} ne $sequence_context ||
	    $context_info{$$snp}->{genomic} ne $genomic_context)
	{
	    print STDERR "\nDiscrepancy between ldSelect file and sequence ";
	    print STDERR "context file for SNP $$snp.\nExiting now.\n\n";
	    exit;
	}
    }
    else
    {
	$context_info{$$snp}->{sequence} = $sequence_context;
	$context_info{$$snp}->{genomic} = $genomic_context;
    }
}


# initialize a hash containing important information about each tagSNP
sub InitializeSNPInfo(\%$$$)
{
    my ($tagSNP_info, $snp, $tag_index, $first_bin) = @_;

    # this array holds the bins that the current SNP tags in each population in
    # the order the populations are represented in the @ldSelect_outputs array
    my @bin_locations = ();

    # initialize @bin_locations array
    for my $i (0..$tag_index-1)
    {
	push @bin_locations, 0;
    }

    push @bin_locations, $first_bin;

    for my $i ($tag_index+1..$num_pops-1)
    {
	push @bin_locations, 0;
    }

    # assign each tagSNP a sequence context score (default = 0)
    my $context_score = ScoreTagSNP($snp);

    # store information about the current tagSNP
    $$tagSNP_info{$snp} = {
	context_score => $context_score,
	bin_locations => \@bin_locations,
	num_bins_tagged => 1
    };

    # if a file of site scores was provided, check to make sure that
    # all observed tagSNPs have scores - if not, assign them scores of zero
    if (!exists $site_scores{$snp})
    {
	$site_scores{$snp} = 0;
    }
}


# Algorithm Step 1:
# Assign tagSNP vectors to mutually exclusive clusters such that a) the "maximally
# informative" vectors (those with the greatest number of informative entries) in
# a cluster are identical and b) the other vectors in a cluster contain no information
# not found in the maximally informative vectors. Choose one SNP from each cluster
# (either at random or on the basis of some desirable quantity, like genomic
# sequence context). These "representative" SNPs are stored in the @representatives
# array, which is passed by reference to this function.
sub ChooseMaximallyInformativeSNPs(\@\@\@)
{
    my ($snp_array, $redundants, $representatives) = @_;

    # sort SNPs by the number of bins they tag, with more-informative SNPs nearer
    # the front of the list
    SortSitesByBinContent(@$snp_array);

    print STDERR "Sites sorted by bin content.\n" if ($verbose eq 'Y');

    my @remaining_snps = @$snp_array;
    my $previous_percentage = 0;

    # each pass through this loop identifies one cluster of tagSNPs and chooses
    # a maximally informative SNP to represent that cluster
    while (scalar(@remaining_snps) > 0)
    {
	my @temp;
	my @maximally_informative_snps = ($remaining_snps[0]);
	my @bin_locations_1 = @{$tag_snp_info{$remaining_snps[0]}->{bin_locations}};

	for my $i (1..$#remaining_snps)
	{
	    my @bin_locations_2 = @{$tag_snp_info{$remaining_snps[$i]}->{bin_locations}};

	    if (AreBinLocationsEquivalent(@bin_locations_1, @bin_locations_2))
	    {
		if ($tag_snp_info{$remaining_snps[0]}->{num_bins_tagged} ==
		    $tag_snp_info{$remaining_snps[$i]}->{num_bins_tagged})
		{
		    push @maximally_informative_snps, $remaining_snps[$i];
		}
		elsif (exists($required_snps{$remaining_snps[$i]}))
		{
		    push @$representatives, $remaining_snps[$i];
		}
		else
		{
		    push @$redundants, $remaining_snps[$i];
		}
	    }
	    else
	    {
		push @temp, $remaining_snps[$i];
	    }
	}

	@remaining_snps = @temp;

	# choose the best SNP (the "representative") from the current cluster of
	# maximally informative SNPs
	my $representative = ChooseRepresentativeSNP(@maximally_informative_snps);
	push @$representatives, $representative;

	# now that a representative SNP has been chosen for the current cluster of
	# maximally informative SNPs, we can eliminate the others
	foreach my $snp (@maximally_informative_snps)
	{
	    next if ($snp eq $representative);

	    if (exists($required_snps{$snp}))
	    {
		push @$representatives, $snp;
		next;
	    }

	    push @$redundants, $snp;
	}

	if ($verbose eq 'Y' &&
	    (100 - 100*scalar(@remaining_snps)/scalar(@$snp_array)) > ($previous_percentage + 1))
	{
	    print STDERR "First algorithm step ";
	    print STDERR sprintf("%.0f", 100-100*scalar(@remaining_snps)/scalar(@$snp_array));
	    print STDERR "% complete.\n";
	    $previous_percentage += 1;
	}
    }
}


# sort SNPs by the number of bins they tag, with more informative SNPs nearer
# the front of the list
sub SortSitesByBinContent(\@)
{
    my $snp_array = shift;
    my @tagged_bin_tallies;     # parallels @$snp_array
    my @sorted_snp_array;

    # determine how many informative entries each SNP's vector contains
    foreach my $snp (@$snp_array)
    {
	push @tagged_bin_tallies, $tag_snp_info{$snp}->{num_bins_tagged};
    }

    # sort SNPs in order of decreasing number of tagged bins
    my @sorted_bin_tallies = reverse sort {$a <=> $b} @tagged_bin_tallies;
    my $previous_tally = 0;
    my $previous_percentage = 0;
    my $previous_tag_index = -1;

    for my $i (0..$#sorted_bin_tallies)
    {
	if ($verbose eq 'Y' && 100*($i+1)/scalar(@sorted_bin_tallies) > ($previous_percentage + 1))
	{
	    print STDERR sprintf("%.0f",100*($i+1)/scalar(@sorted_bin_tallies)),"% of sites sorted.\n";
	    $previous_percentage += 1;
	}

	if ($sorted_bin_tallies[$i] != $previous_tally)
	{
	    $previous_tag_index = -1;
	}

	for my $j ($previous_tag_index+1..$#tagged_bin_tallies)
	{
	    if ($sorted_bin_tallies[$i] == $tagged_bin_tallies[$j])
	    {
		push @sorted_snp_array, $$snp_array[$j];
		$previous_tag_index = $j;
		last;
	    }
	}

	$previous_tally = $sorted_bin_tallies[$i];
    }

    @$snp_array = @sorted_snp_array;
}


# check to see if the bins tagged by one SNP are "equivalent" to the bins tagged
# by a second SNP, which must tag the same number or fewer bins; here,
# "equivalent" means that there is at least one population in which both SNPs
# are tagSNPs and that these SNPs tag the same bins in all such populations
sub AreBinLocationsEquivalent(\@\@)
{
    my $bin_locations_1 = shift;
    my $bin_locations_2 = shift;

    my $comparison_made = 0;    # boolean variable - were there any populations in which the
                                # two tagSNPs in question were both present?

    for my $i (0..$#$bin_locations_1)
    {
	next if ($$bin_locations_2[$i] == 0);

	$comparison_made = 1;

	return 0 if ($$bin_locations_1[$i] != $$bin_locations_2[$i]);
    }

    # if the two tagSNPs in question are not simultaneously present in any population, they
    # are not equivalent for the purposes of this analysis
    return $comparison_made;
}


# return the tagSNP that has the highest precedence in a given group of maximally
# informative SNPs (the input array contains one such group); if required SNPs are
# present, automatically choose the first one; if multiple tagSNPs tie, choose the
# one that has the best context score; if multiple tagSNPs still tie, choose the
# one that has the best site score; break any remaining ties by choosing the first
# tagSNP in the list
sub ChooseRepresentativeSNP(\@)
{
    my $max_inform_snps = shift;

    # degenerate case: if there is only one maximally informative SNP, it is
    # automatically chosen to be the representative
    return ($$max_inform_snps[0]) if (scalar(@$max_inform_snps) == 1);

    # if the previous return statement isn't triggered, the maximally informative set
    # contains more than one SNP; therefore, we need to choose among these SNPs
    my $best_context_score = 0;
    my @best_scoring_snps = ();

    foreach my $snp (@$max_inform_snps)
    {
	# if there is a required SNP among the maximally informative SNPs,
	# it is automatically selected
	if (exists($required_snps{$snp}))
	{
	    return $snp;
	}

	my $context_score = $tag_snp_info{$snp}->{context_score};

	if ($context_score > $best_context_score)
	{
	    $best_context_score = $context_score;
	    @best_scoring_snps = ();

	    push (@best_scoring_snps, $snp);
	}
	elsif ($context_score == $best_context_score)
	{
	    push (@best_scoring_snps, $snp);
	}
    }

    if ($scores ne "" && scalar(@best_scoring_snps) > 1)
    {
	my $best_site_score = 0;
	my @high_scoring_snps = ();

	foreach my $snp (@best_scoring_snps)
	{
	    my $site_score = $site_scores{$snp};

	    if ($site_score > $best_site_score)
	    {
		$best_site_score = $site_score;
		@high_scoring_snps = ();

		push @high_scoring_snps, $snp;
	    }
	    elsif ($site_score == $best_site_score)
	    {
		push @high_scoring_snps, $snp;
	    }
	}

	return $high_scoring_snps[0];
    }
    else
    {
	return $best_scoring_snps[0];
    }
}


# Algorithm Step 2a:
# Assemble maximally informative tagSNPs into a list, either in random order
# or ranked by some desirable qualities (like genomic and sequence context).
sub CompileRankedList(\@\@)
{
    my ($snp_list, $ranked_list) = @_;

    # direct each tagSNP to the proper sequence context array
    foreach my $snp (@$snp_list)
    {
	SendSNPToListSegment($snp);
    }

    # sort each sequence context array in order of increasing site scores
    SortArrayBySiteScore(@RF3_snps);
    SortArrayBySiteScore(@RI_snps);
    SortArrayBySiteScore(@RF5_snps);
    SortArrayBySiteScore(@RU_snps);
    SortArrayBySiteScore(@RS_snps);
    SortArrayBySiteScore(@RN_snps);
    SortArrayBySiteScore(@RT_snps);
    SortArrayBySiteScore(@UF3_snps);
    SortArrayBySiteScore(@UI_snps);
    SortArrayBySiteScore(@UF5_snps);
    SortArrayBySiteScore(@UU_snps);
    SortArrayBySiteScore(@US_snps);
    SortArrayBySiteScore(@UN_snps);
    SortArrayBySiteScore(@UT_snps);

    # consolidate sequence context arrays
    my @repeat_seq_snps = (@RF3_snps,@RI_snps,@RF5_snps,@RU_snps,@RS_snps,@RN_snps,@RT_snps);
    my @unique_seq_snps = (@UF3_snps,@UI_snps,@UF5_snps,@UU_snps,@US_snps,@UN_snps,@UT_snps);
    @$ranked_list = (@repeat_seq_snps, @unique_seq_snps);

    # This option is mostly for testing purposes; there's no particular
    # reason to randomize the list order in practice (it will obliterate
    # the previously established precedence hierarchy). If desired, however,
    # simply removing the comment below will have that effect.
    #RandomizeList(@$ranked_list);   
}


# assign a SNP to the proper list segment (a list segment is an array
# containing all SNPs with the same sequence and genomic context)
sub SendSNPToListSegment($)
{
    my $snp = shift;
    
    # default: send SNP to @RF3_snps array if it lacks sequence context info
    if (!exists($context_info{$snp}))
    {
	push @RF3_snps, $snp;
	return;
    }

    # (U)nique or (R)epetitive?
    my $sequence_context = $context_info{$snp}->{sequence};

    # (N)onsynonymous, (S)ynonymous, (U)ntranslated region, (F)lanking,
    # (I)ntron, (T) frameshift, or (X) other?
    my $genomic_context = $context_info{$snp}->{genomic};

    # assign the SNP to the proper list segment
    if ($sequence_context eq 'R')
    {
	if ($genomic_context eq 'F')
	{
	    # distinguish between 5'- and 3'-flanking SNPs
	    if (exists($context_info{$snp}->{location}) &&
		$context_info{$snp}->{location} eq '5-prime')
	    {
		push @RF5_snps, $snp;
	    }
	    else
	    {
		push @RF3_snps, $snp;
	    }
	}
	elsif ($genomic_context eq 'I')
	{
	    push @RI_snps, $snp;
	}
	elsif ($genomic_context eq 'U')
	{
	    push @RU_snps, $snp;
	}
	elsif ($genomic_context eq 'S')
	{
	    push @RS_snps, $snp;
	}
	elsif ($genomic_context eq 'N')
	{
	    push @RN_snps, $snp;
	}
	elsif ($genomic_context eq 'T')
	{
	    push @RT_snps, $snp;
	}
	else
	{
	    unshift @RF3_snps, $snp;
	}
    }
    elsif ($sequence_context eq 'U')
    {
	if ($genomic_context eq 'F')
	{
	    # distinguish between 5'- and 3'-flanking SNPs
	    if (exists($context_info{$snp}->{location}) &&
		$context_info{$snp}->{location} eq '5-prime')
	    {
		push @UF5_snps, $snp;
	    }
	    else
	    {
		push @UF3_snps, $snp;
	    }
	}
	elsif ($genomic_context eq 'I')
	{
	    push @UI_snps, $snp;
	}
	elsif ($genomic_context eq 'U')
	{
	    push @UU_snps, $snp;
	}
	elsif ($genomic_context eq 'S')
	{
	    push @US_snps, $snp;
	}
	elsif ($genomic_context eq 'N')
	{
	    push @UN_snps, $snp;
	}
	elsif ($genomic_context eq 'T')
	{
	    push @UT_snps, $snp;
	}
	else
	{
	    unshift @UF3_snps, $snp;
	}
    }
    else
    {
	unshift @RF3_snps, $snp;
    }
}


# sort the SNPs in the input array in order of increasing site scores
sub SortArrayBySiteScore(\@)
{
    my $snp_array = shift;

    return if (scalar(@$snp_array) == 0);

    my @site_score_array;     # parallels @$snp_array
    my @sorted_snp_array;

    # catalogue site scores for the relevant SNPs
    foreach my $snp (@$snp_array)
    {
	push @site_score_array, $site_scores{$snp};
    }

    # sort SNPs in order of increasing site scores
    my @sorted_site_scores = sort {$a <=> $b} @site_score_array;
    my $previous_score = 0;
    my $previous_tag_index = -1;

    for my $i (0..$#sorted_site_scores)
    {
	if ($sorted_site_scores[$i] != $previous_score)
	{
	    $previous_tag_index = -1;
	}

	for my $j ($previous_tag_index+1..$#site_score_array)
	{
	    if ($sorted_site_scores[$i] == $site_score_array[$j])
	    {
		push @sorted_snp_array, $$snp_array[$j];
		$previous_tag_index = $j;
		last;
	    }
	}

	$previous_score = $sorted_site_scores[$i];
    }

    @$snp_array = @sorted_snp_array;
}


# take an array, shuffle its elements into a new random order, and return the 
# randomized array
sub RandomizeList(\@)
{
    my $original_list = shift;
    my @temp_list = ();

    while (@$original_list)
    {
	push (@temp_list, splice(@$original_list, rand(@$original_list), 1));
    }

    @$original_list = @temp_list;
}


# Algorithm Step 2b:
# Traverse the tagSNP list, removing each SNP in turn. If no bin in any population
# loses representation, discard the SNP; otherwise, return it to the list. 
sub FindMinimalTagSNPSet(\@\@\@)
{
    my ($ranked_snps, $discarded_representative_snps, $near_minimal_set) = @_;
    my @ranked_snp_list = @$ranked_snps;
    my @covered_bins_repository = ();

    FillCoveredBinsRepository(@covered_bins_repository, @ranked_snp_list);

    @representative_snp_bin_repository = DeepCopy(@covered_bins_repository) if ($optimal eq 'Y');

    my $list_posn = 0;

    while ($list_posn <= $#ranked_snp_list)
    {
	# discard the current SNP if the bins it covers can be covered by other SNPs in the list
	if (IsSNPExpendable(@covered_bins_repository, $ranked_snp_list[$list_posn]))
	{
	    push @$discarded_representative_snps, $ranked_snp_list[$list_posn];
	    @ranked_snp_list = (@ranked_snp_list[0..$list_posn-1], @ranked_snp_list[$list_posn+1..$#ranked_snp_list]);
	}
	else   # leave the @ranked_snps list intact
	{
	    $list_posn++;
	}
    }

    print STDERR "Second algorithm step complete.\n" if ($verbose eq 'Y');

    @$near_minimal_set = @ranked_snp_list;
}


# fill a repository (first input array) with a list of arrays containing the
# bins captured in each population by a list of SNPs (second input array)
sub FillCoveredBinsRepository(\@\@)
{
    my ($repository, $snp_array) = @_;

    for my $i (0..$num_pops-1)
    {
	my @bins_covered = ();

	foreach my $snp (@$snp_array)
	{
	    my $bin = $tag_snp_info{$snp}->{bin_locations}->[$i];
	    next if ($bin == 0);

	    if (scalar(@bins_covered) < $bin)
	    {
		for my $j (scalar(@bins_covered)..$bin-1)
		{
		    $bins_covered[$j] = 0;
		}
	    }

	    $bins_covered[$bin-1]++;
	}

	push (@$repository, \@bins_covered);
    }
}


# the input array should contain a list of array references
sub DeepCopy(\@)
{
    my $array_to_copy = shift;
    my @return_array;

    foreach my $array_entry (@$array_to_copy)
    {
	my @array_entry_copy = @$array_entry;
	push @return_array, \@array_entry_copy;
    }

    return @return_array;
}


# can this SNP be discarded without causing any bin to lose representation?
sub IsSNPExpendable(\@$)
{
    my ($repository, $snp) = @_;

    return 0 if (exists($required_snps{$snp}));   # required SNPs are never expendable

    my @bins_lost = @{$tag_snp_info{$snp}->{bin_locations}};

    for my $i (0..$#bins_lost)
    {
	next if ($bins_lost[$i] == 0);
	return 0 if ($$repository[$i]->[$bins_lost[$i]-1] == 1);
    }

    # if this part of the function is reached, the queried SNP is expendable;
    # this being the case, we need to remove its entries from the covered bins repository
    for my $i (0..$#bins_lost)
    {
	next if ($bins_lost[$i] == 0);
	$$repository[$i]->[$bins_lost[$i]-1]--;
    }

    return 1;
}


# is the input element present in the input array?
sub IsElementInArray(\@$)
{
    my $array_ref = shift;
    my $array_entry = shift;

    foreach my $element (@$array_ref)
    {
	if ($element eq $array_entry)
	{
	    return 1;
	}
    }

    return 0;
}


# defaults to zero if the tagSNP has no sequence context info; note that
# the magnitude of a tagSNP's score does not affect the algorithm's decisions
# since it is concerned with RELATIVE scores only
sub ScoreTagSNP($)
{
    my $tag_snp = shift;
    my $score = 0;
    my $first_letter = '';
    my $second_letter = '';

    return $score if (!exists($context_info{$tag_snp}));

    # U(nique) or R(epetitive)?
    my $sequence_context = $context_info{$tag_snp}->{sequence};

    # N(onsynonymous), S(ynonymous), U(ntranslated region), F(lanking),
    # I(ntron), or frameshif(T)?
    my $genomic_context = $context_info{$tag_snp}->{genomic};

    # all SNPs in unique sequence receive higher scores than SNPs in
    # repetitive sequence
    if ($sequence_context eq 'U')
    {
	$score += 7;
    }

    if ($genomic_context eq 'F')
    {
	if (exists($context_info{$tag_snp}->{location}) &&
	    $context_info{$tag_snp}->{location} eq '5-prime')
	{
	    $score += 3;
	}
	else
	{
	    $score += 1;
	}
    }
    elsif ($genomic_context eq 'I')
    {
	$score += 2;
    }
    elsif ($genomic_context eq 'U')
    {
	$score += 4;
    }
    elsif ($genomic_context eq 'S')
    {
	$score += 5;
    }
    elsif ($genomic_context eq 'N')
    {
	$score += 6;
    }
    elsif ($genomic_context eq 'T')
    {
	$score += 7;
    }

    return $score;
}


# sort SNPs and attach context IDs (if present) and bin info
sub SortSelectedSNPs(\@)
{
    my $selected_snps = shift;
    my @sorted_snps;

    # sort SNPs (either numerically or alphabetically, as appropriate)
    if (scalar(@$selected_snps) > 0)
    {
	if ($$selected_snps[0] =~ /\D/)
	{
	    @sorted_snps = sort @$selected_snps;
	}
	else
	{
	    @sorted_snps = sort {$a <=> $b} @$selected_snps;
	}
    }

    foreach my $snp (@sorted_snps)
    {
	my @bin_locations = @{$tag_snp_info{$snp}->{bin_locations}};

	# append context info onto each SNP (if present)
	if (exists($context_info{$snp}))
	{
	    my $context_id = $context_info{$snp}->{sequence};
	    $context_id .= $context_info{$snp}->{genomic};

	    $snp = $context_id.'-'.$snp;
	}

	# append bin information onto each SNP
	$snp .= "-";

	for my $i (0..$#bin_locations)
	{
	    $snp .= "$bin_locations[$i]";
	    $snp .= "/" unless ($i == $#bin_locations);
	}
    }

    @$selected_snps = @sorted_snps;
}


# print program output
sub PrintResults(\@)
{
    my $final_snp_set = shift;

    print "num_tagSNPs: ", scalar(@$final_snp_set), "\n";
    print "Selected_tagSNPs: @$final_snp_set\n\n";
}


# parse the command line
sub ParseCommandLine()
{
    my $usage = "\nmultiPopTagSelect.pl\n";
    $usage .= "   [-ld_select]    file containing a list of ldSelect output file names\n";
    $usage .= "   <-context>      file containing genomic and sequence context info\n";
    $usage .= "   <-scores>       file containing SNP design scores\n";
    $usage .= "   <-excluded>     file containing a list of SNPs that must not be selected\n";
    $usage .= "   <-required>     file containing a list of SNPs that must be selected\n";
    $usage .= "   <-optimal>      seek a provably optimal solution? (y or n; default = n)\n";
    $usage .= "   <-verbose>      display program progress? (y or n; default = n)\n\n";

    # default values
    $arg{-ld_select} = "";
    $arg{-context} = "";
    $arg{-scores} = "";
    $arg{-excluded} = "";
    $arg{-required} = "";
    $arg{-optimal} = 'N';
    $arg{-verbose} = 'N';

    # parse the command line
    for (my $i = 0; $i <= $#ARGV; $i++)
    {
	if ($ARGV[$i] =~ /^-/)
	{
	    $arg{$ARGV[$i]} = $ARGV[$i+1];
	}
    }

    $arg{-optimal} = uc($arg{-optimal});
    $arg{-verbose} = uc($arg{-verbose});

    die ($usage) if (!$arg{-ld_select});
    die ($usage) if ($arg{-optimal} ne 'N' && $arg{-optimal} ne 'Y');
    die ($usage) if ($arg{-verbose} ne 'N' && $arg{-verbose} ne 'Y');
}


###################################################################
#                 OPTIMAL SNP SELECTION FUNCTIONS                 #
###################################################################

# check to see whether the current solution is truly minimal; if not,
# replace it with an optimal solution
sub AreDiscardedSNPsOptimal(\@)
{
    my $best_current_solution = shift;
    my $are_discarded_snps_optimal = 1;    # boolean return variable
    my @discarded_snp_clusters;
    my %distinct_overlapping_snps;
    my (@redundant_selected_snps, @unique_selected_snps);

    # looking back to the algorithm's second step, determine which selected SNPs
    # uniquely tagged a bin (note that these SNPs are a superset of "mandatory" SNPs)
    # and which SNPs could have been discarded under at least one possible list order;
    # refer to the first group as "unique" SNPs and the second as "redundant" SNPs
    foreach my $selected_snp (@$best_current_solution)
    {
	if (IsSNPEverExpendable(@representative_snp_bin_repository, $selected_snp))
	{
	    push @redundant_selected_snps, $selected_snp;
	}
	else
	{
	    push @unique_selected_snps, $selected_snp;
	}
    }

    return 1 if (scalar(@redundant_selected_snps) == 0);

    # identify (redundant) selected SNPs that tag some of the same bins as SNPs that
    # were discarded in the second step of the algorithm; refer to these selected SNPs
    # (and the corresponding discarded SNPs) as "overlapping" since their bin coverage
    # vectors "overlap" in the sense of sharing at least one (non-zero) element

    # this hash keeps track of which discarded SNPs are "singly connected"; each such
    # SNP overlaps with only one redundant selected SNP
    my %is_snp_singly_connected;

    foreach my $discarded_snp (@discarded_snps)
    {
	my (@overlapping_selected_snps, @other_snps);
	@other_snps = @unique_selected_snps;

	foreach my $selected_snp (@redundant_selected_snps)
	{
	    if (DoesBinCoverageOverlap($selected_snp, $discarded_snp))
	    {
		push @overlapping_selected_snps, $selected_snp;
	    }
	    else
	    {
		push @other_snps, $selected_snp;
	    }
	}

	if (scalar(@overlapping_selected_snps) == 1)
	{
	    $is_snp_singly_connected{$discarded_snp} = 1;
	}
	else
	{
	    $discarded_snp_info{$discarded_snp}->{overlap} = \@overlapping_selected_snps;
	    $discarded_snp_info{$discarded_snp}->{other} = \@other_snps;
	}
    }

    # cluster discarded SNPs based on shared overlapping selected SNPs
    my %is_snp_clustered;

    for my $i (0..$#discarded_snps)
    {
	next if ($is_snp_singly_connected{$discarded_snps[$i]});
	next if ($is_snp_clustered{$discarded_snps[$i]});

	my @current_overlaps = @{$discarded_snp_info{$discarded_snps[$i]}->{overlap}};

	if (scalar(@current_overlaps) == 0)
	{
	    $is_snp_clustered{$discarded_snps[$i]} = 1;
	    next;
	}

	my @current_cluster = ($discarded_snps[$i]);
	$is_snp_clustered{$discarded_snps[$i]} = 1;

	my $sentinel = $discarded_snps[$i];
	my $list_posn = $i;

	do
	{
	    if (!exists($is_snp_clustered{$discarded_snps[$list_posn]}) &&
		!exists($is_snp_singly_connected{$discarded_snps[$list_posn]}))
	    {
		my @overlaps = @{$discarded_snp_info{$discarded_snps[$list_posn]}->{overlap}};

		foreach my $overlap_snp (@overlaps)
		{
		    if (IsElementInArray(@current_overlaps, $overlap_snp))
		    {
			push @current_cluster, $discarded_snps[$list_posn];
			$is_snp_clustered{$discarded_snps[$list_posn]} = 1;
			$sentinel = $discarded_snps[$list_posn];

			foreach my $overlap (@overlaps)
			{
			    push @current_overlaps, $overlap if (!IsElementInArray(@current_overlaps, $overlap));
			}

			last;
		    }
		}
	    }

	    $list_posn++;
	    $list_posn = 0 if ($list_posn > $#discarded_snps);

	} while ($discarded_snps[$list_posn] ne $sentinel);

	push @discarded_snp_clusters, \@current_cluster;
	$distinct_overlapping_snps{$discarded_snps[$i]} = \@current_overlaps;

# ---------------------------------------------------------------------------------
# For debugging:
# ---------------------------------------------------------------------------------
#	print STDERR "|@current_cluster| - |@current_overlaps|\n";
#
#	foreach my $discard (@current_cluster)
#	{
#	    print STDERR "|$discard| (@{$tag_snp_info{$discard}->{bin_locations}}):\n";
#
#	    foreach my $overlap (@current_overlaps)
#	    {
#		print STDERR "   |$overlap| (@{$tag_snp_info{$overlap}->{bin_locations}})\n";
#	    }
#
#	    print STDERR "\n";
#	}
# ---------------------------------------------------------------------------------
    }

    my %snp_removal_templates;
    CreateSNPRemovalTemplates(@discarded_snp_clusters, %distinct_overlapping_snps,
			      %snp_removal_templates);

    my %was_selected_snp_replaced;   # keeps track of selected SNPs replaced by discarded SNPs
    my @repatriated_discards;

    CLUSTER: foreach my $discarded_snp_cluster (@discarded_snp_clusters)
    {
	my @distinct_overlapping_snps = @{$distinct_overlapping_snps{$$discarded_snp_cluster[0]}};
	my @distinct_other_snps;
	my @discard_removal_templates = @{$snp_removal_templates{scalar(@$discarded_snp_cluster)}};
	my @overlap_removal_templates = @{$snp_removal_templates{scalar(@distinct_overlapping_snps)}};

	# define @distinct_other_snps as the intersection of the "other SNPs" for all discarded
	# SNPs in the current cluster; we will not try to replace the @distinct_other_SNPs this
	# time through the CLUSTER loop
	foreach my $other_snp (@{$discarded_snp_info{$$discarded_snp_cluster[0]}->{other}})
	{
	    next if ($was_selected_snp_replaced{$other_snp});

	    my $is_always_other = 1;   # boolean variable - is this SNP an "other SNP" for
	                               # every discarded SNP in this cluster?

	    for my $i (1..$#$discarded_snp_cluster)
	    {
		my @overlap_snps = @{$discarded_snp_info{$$discarded_snp_cluster[$i]}->{overlap}};

		if (IsElementInArray(@overlap_snps, $other_snp))
		{
		    $is_always_other = 0;
		    last;
		}
	    }

	    push @distinct_other_snps, $other_snp if ($is_always_other);
	}

	my @smallest_snp_sets;
	my @replaced_selected_snps = ();
	my @repatriated_discarded_snps = ();
	my $was_substitution_made = 0;           # boolean variable

	my @current_snp_set = @$best_current_solution;
	push @smallest_snp_sets, \@current_snp_set;

	foreach my $discard_template (@discard_removal_templates)
	{
	    foreach my $overlap_template (@overlap_removal_templates)
	    {
		next if (!AreTemplatesRelevant(@$discard_template, @$overlap_template));

		# @residual_snps are SNPs that remain after purging the SNPs dictated by the
		# template instructions; i.e., they include all @distinct_other_snps and subsets
		# of the discarded SNPs and overlapping selected SNPs in the current cluster
		my (@residual_snps, @purged_snps, @replacement_snps, @replaced_snps);

		# execute the template instructions for the discarded SNPs in the current
		# cluster; here, @replacement_snps are those that we will attempt to
		# substitute for redundant selected SNPs
		for my $i (0..$#$discard_template)
		{
		    if ($$discard_template[$i] eq '1')
		    {
			push @residual_snps, $$discarded_snp_cluster[$i];
			push @replacement_snps, $$discarded_snp_cluster[$i];
		    }
		}

		# execute the template instructions for the overlapping selected SNPs
		# corresponding to the current cluster; here, @replaced_snps are those
		# that will be deemed unnecessary if the proposed substition leads to a
		# more parsimonious solution
		for my $i (0..$#$overlap_template)
		{
		    if ($$overlap_template[$i] eq '1')
		    {
			push @residual_snps, $distinct_overlapping_snps[$i];
		    }
		    else
		    {
			push @replaced_snps, $distinct_overlapping_snps[$i];
		    }
		}

		push @residual_snps, @distinct_other_snps;
		push @residual_snps, @repatriated_discards;

		# if the @residual_snps capture all of the bins lost through eliminating
		# the @replaced_snps, they represent a valid alternative to the original
		# selected SNP set
		if (DoResidualSNPsAccountForAllBins(@residual_snps, @replaced_snps))
		{
#		    print STDERR "Replacement: |@replacement_snps|\n";
#		    print STDERR "Replaced: |@replaced_snps|\n";

		    # if the new SNP set is smaller than any existing ones, store it as
		    # the uniquely smallest set thus far; if it is the same size as
		    # existing sets, store it as an equivalent alternative
		    if (scalar(@residual_snps) < scalar(@{$smallest_snp_sets[0]}))
		    {
			$are_discarded_snps_optimal = 0;
			$was_substitution_made = 1;

			@smallest_snp_sets = (\@residual_snps);
			@replaced_selected_snps = (\@replaced_snps);
			@repatriated_discarded_snps = (\@replacement_snps);
		    }
#----------------------------------------------------------------------------------
# NOTE: this conditional clause might prove useful later if we want to
# choose between minimal sets of the same size based on additional criteria
#		    elsif (scalar(@residual_snps) == scalar(@{$smallest_snp_sets[0]}))
#		    {
#			push @smallest_snp_sets, \@residual_snps;
#			push @replaced_selected_snps, \@replaced_snps;
#		    }
#----------------------------------------------------------------------------------
		}
	    }
	}

	if ($was_substitution_made)
	{
	    @$best_current_solution = @{$smallest_snp_sets[0]};

	    foreach my $replaced_snp (@{$replaced_selected_snps[0]})
	    {
		$was_selected_snp_replaced{$replaced_snp} = 1;
	    }

	    push @repatriated_discards, @{$repatriated_discarded_snps[0]};
	}
    }

    return $are_discarded_snps_optimal;
}


# could this SNP be discarded from any possible list?
sub IsSNPEverExpendable(\@$)
{
    my ($repository, $snp) = @_;

    return 0 if (exists($required_snps{$snp}));   # required SNPs are never discardable

    my @bins_lost = @{$tag_snp_info{$snp}->{bin_locations}};

    for my $i (0..$#bins_lost)
    {
	next if ($bins_lost[$i] == 0);
	return 0 if ($$repository[$i]->[$bins_lost[$i]-1] == 1);
    }

    # if this part of the function is reached, the queried SNP is expendable
    # in at least one list order
    return 1;
}


# do these two SNPs tag any of the same bins?
sub DoesBinCoverageOverlap($$)
{
    my ($snp_1, $snp_2) = @_;

    my @snp_1_bins = @{$tag_snp_info{$snp_1}->{bin_locations}};
    my @snp_2_bins = @{$tag_snp_info{$snp_2}->{bin_locations}};

    for my $i (0..$#snp_1_bins)
    {
	if ($snp_1_bins[$i] != 0 && $snp_1_bins[$i] == $snp_2_bins[$i])
	{
	    return 1;
	}
    }

    return 0;
}


#
sub AreArraysIdentical(\@\@)
{
    my ($array_1, $array_2) = @_;

    return 0 if (scalar(@$array_1) != scalar(@$array_2));

    for my $i (0..$#$array_1)
    {
	return 0 if ($$array_1[$i] ne $$array_2[$i]);
    }

    return 1;
}


# make a set of template arrays that dictate relevant SNP removal
# patterns (a pattern is not relevant if we can determine, a priori,
# that it will not improve our situation)
sub CreateSNPRemovalTemplates(\@\%\%)
{
    my ($discard_clusters, $distinct_overlaps, $removal_templates) = @_;
    my @relevant_templates;

    # create SNP removal templates for discarded SNP clusters
    foreach my $discard_cluster (@$discard_clusters)
    {
	next if (exists $$removal_templates{scalar(@$discard_cluster)});

	my @composite_array;

	for my $i (1..scalar(@$discard_cluster))
	{
	    my @binary_array = (0, 1);
	    push @composite_array, \@binary_array;
	}

	my @removal_templates = AssemblePatternArrays(@composite_array);
	$$removal_templates{scalar(@$discard_cluster)} = \@removal_templates;
    }

    # create SNP removal templates for the union of selected SNPs that overlap
    # each discarded SNP cluster
    foreach my $discard_cluster (@$discard_clusters)
    {
	my @distinct_overlapping_snps = @{$$distinct_overlaps{$$discard_cluster[0]}};

	next if (exists $$removal_templates{scalar(@distinct_overlapping_snps)});

	my @composite_array;

	for my $i (1..scalar(@distinct_overlapping_snps))
	{
	    my @binary_array = (0, 1);
	    push @composite_array, \@binary_array;
	}

	my @removal_templates = AssemblePatternArrays(@composite_array);
	$$removal_templates{scalar(@distinct_overlapping_snps)} = \@removal_templates;
    }
}


#
sub AssemblePatternArrays(\@)
{
    my $composite_array = shift;
    my @pattern_arrays;

    my @first_array = @{$$composite_array[0]};

    for my $i (0..$#first_array)
    {
	$pattern_arrays[$i][0] = $first_array[$i];
    }

    for my $i (1..$#$composite_array)
    {
	my @current_array = @{$$composite_array[$i]};
	my $num_elements = scalar(@current_array);

	ReplicatePatternArrays(@pattern_arrays, $num_elements-1);

	my $index = 0;

	foreach my $element (@current_array)
	{
	    for my $j (1..scalar(@pattern_arrays)/$num_elements)
	    {
		$pattern_arrays[$index][$i] = $element;
		$index++;
	    }
	}
    }

# --------------------------------------------------------
# For debugging:
# --------------------------------------------------------
#    for my $i (0..$#pattern_arrays)
#    {
#	for my $j (0..$#{$pattern_arrays[$i]})
#	{
#	    print STDERR "$pattern_arrays[$i][$j] ";
#	}
#
#	print STDERR "\n";
#    }
# --------------------------------------------------------

    return @pattern_arrays;
}


#
sub ReplicatePatternArrays(\@$)
{
    my ($pattern_arrays, $num_reps) = @_;

    my @base_array = @$pattern_arrays;
    my $next_index = $#$pattern_arrays + 1;

    for my $i (1..$num_reps)
    {
	for my $j (0..$#base_array)
	{
	    @{$$pattern_arrays[$next_index]} = @{$base_array[$j]};
	    $next_index++;
	}
    }
}


# to improve the solution, N discarded SNPs must replace at least
# N+1 selected SNPs
sub AreTemplatesRelevant(\@\@)
{
    my ($discard_template, $overlap_template) = @_;
    my $num_discards_retained = 0;
    my $num_overlaps_eliminated = 0;

    foreach my $element (@$discard_template)
    {
	$num_discards_retained++ if ($element eq '1');
    }

    foreach my $element (@$overlap_template)
    {
	$num_overlaps_eliminated++ if ($element eq '0');
    }

    if ($num_discards_retained > 0 && $num_discards_retained < $num_overlaps_eliminated)
    {
	return 1;
    }
    else
    {
	return 0;
    }
}


#
sub DoResidualSNPsAccountForAllBins(\@\@)
{
    my ($residual_snps, $purged_snps) = @_;
    my (%residual_bin_indicators, %purged_bin_indicators);

    # assign %residual_bin_indicators
    foreach my $residual_snp (@$residual_snps)
    {
	my @bins = @{$tag_snp_info{$residual_snp}->{bin_locations}};

	for my $i (0..$num_pops-1)
	{
	    $residual_bin_indicators{$i}->{$bins[$i]} = 1;
	}
    }

    # assign %purged_bin_indicators
    foreach my $purged_snp (@$purged_snps)
    {
	my @bins = @{$tag_snp_info{$purged_snp}->{bin_locations}};

	for my $i (0..$num_pops-1)
	{
	    $purged_bin_indicators{$i}->{$bins[$i]} = 1;
	}
    }

    # determine whether all bins that were purged are still represented
    # by the residual SNPs
    for my $i (0..$num_pops-1)
    {
	foreach my $purged_bin (keys %{$purged_bin_indicators{$i}})
	{
	    if (!exists($residual_bin_indicators{$i}->{$purged_bin}))
	    {
		return 0;
	    }
	}
    }

    return 1;
}
