import random
import string

import sys
import tools
from tools import pp

# the representation for a point is:
#
#    (name, value)
#
# the rep for a cluster is
#
#    ([name1, name2...], mean_value, size)
#

NAME = 0 # the name of the read, or the list of names for a cluster
VALUE = 1 # the value vector of a read, or the mean/profile of a cluster
SIZE = 2 # the number of points

############################################################
#
#                POINTS IN EUCLIDEAN SPACE
#
############################################################

def euclidian_distance(x1,x2):
    # calculates the squared euclidean distance between two sample points
    # dist = Sum_i( (x1[i] - x2[i])^2 )   (no square root is taken)
    return tools.sum(map(lambda x: x*x,
                         map(lambda o1x,o2x: (o1x-o2x), 
                             x1, x2)))
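# Minimal usage sketch (illustrative, assuming tools.sum behaves like the
# builtin sum); note the value returned is the *squared* distance.
#
#    euclidian_distance([0,0],[3,4])     # -> 25  (i.e. 5^2, no square root)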

def weighted_average(xs, ws):
    # average the vectors xs=[x1,x2,x3...], with weights ws=[w1,w2,w3...]
    # result[i] = 1/(w1+w2+w3+...) * (x1[i]*w1 + x2[i]*w2 + x3[i]*w3 + ...)

    # requires: all vectors x1,x2,x3... are same length
    #           len(xs) = len(ws)
    # returns mean of length = len(x1) = len(x2) = ...
    mean = [0]*len(xs[0])
    sum_w = tools.sum(ws)
    for i in range(0, len(mean)):
        sum_i = 0
        for x,w in map(None,xs,ws):
            sum_i = sum_i + x[i] * w
        mean[i] = sum_i/float(sum_w)
    return mean
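# Illustrative example: equal weights give the plain component-wise mean.
#
#    weighted_average([[0.0,2.0],[2.0,4.0]], [1,1])   # -> [1.0, 3.0]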

def points2cluster(points):
    # from (name, value)
    # to ([name], value)
    return (map(lambda p: p[NAME],points),
            weighted_average(map(lambda p: p[VALUE],points),[1]*len(points)),
            len(points))

############################################################
#
#                N-MERS IN STRING SPACE
#
############################################################

def create_empty_profile(length):
    """ Create an empty profile (a cluster with no points assigned) """
    profile = []
    for i in range(0,length):
        position = {}
        tools.mset(position,'ACGT',[0,0,0,0])
        profile.append(position)
    return profile

def strings2profile(strings):
    """ Creates a profile from a set of strings.

    Assumes that all strings are same length"""
    if not strings: return None
    
    profile = create_empty_profile(len(strings[0]))
    for s in strings:
        # add every string to the profile
        for i in range(0,len(s)):
            char = s[i]
            if char in 'ACGT':
                # if the character is unambiguous, add unit to profile
                profile[i][char] = profile[i][char] + 1
            else:
                # if the character is ambiguous, spread a unit count over its IUB expansion
                chars = tools.IUB_expansion(char)
                increment = 1.0/float(len(chars))
                for char in chars:
                    profile[i][char] = profile[i][char] + increment
    return profile
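# Illustrative example, assuming tools.IUB_expansion('R') expands to 'AG':
# two unambiguous strings plus one with the ambiguity code R at position 0.
#
#    p = strings2profile(['ACG','ACG','RCG'])
#    p[0]    # counts: A=2.5, C=0, G=0.5, T=0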

def distance_string_profile(s,p):
    """ Evaluates the distance from a string to a profile """
    if not p: return len(s)*.75
    
    # the number of points added to the cluster
    tot_p = tools.sum(tools.mget(p[0],'ACGT'))
    # the overlap between this string and the profile
    score = 0
    for i in range(0,len(s)):
        char = s[i]
        if char in 'ACGT':
            score = score + p[i][char]
        else:
            # for an ambiguous character, weight each expanded character's
            # profile count by its share of the ambiguity
            chars = tools.IUB_expansion(char)
            increment = 1.0/float(len(chars))
            for char in chars:
                score = score + increment * p[i][char]
    distance = (tot_p*len(s))-score
    #print "Distance from %s to %s is %s"%(s,display_profile(p),distance)
    return distance
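# Illustrative example, continuing from the 3-string profile above: tot_p is 3,
# and a perfect per-position match scores tot_p, so the consensus string ends
# up very close to distance 0.
#
#    distance_string_profile('ACG', p)    # -> 3*3 - (2.5+3+3) = 0.5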

def display_profile(profile):
    if not profile: return '[](0)'
    tot_p = tools.sum(tools.mget(profile[0],'ACGT'))
    s = ''
    for pos in profile:
        s = s+'[%s]'%string.join(map(lambda s: s[0],
                                          filter(lambda s: s[1], pos.items())),'')
    s = s+'(%s)'%tot_p
    return s

######## More operations, not needed for k-means
#
#def update_profile(profile, string):
#    """ Adding a string into a profile (adding a point to a cluster) """
#    if len(profile)!=len(string): raise "Inconsistent lengths", (len(profile),len(string))
#    for i in range(0,len(string)):
#        char = string[i]
#        profile[i][char] = profile[i][char] + 1
#
#def string2profile(string):
#    """ Transforming a string into a profile (point -> cluster). """
#    if type(string) == type(''): 
#        profile = create_empty_profile(len(string))
#        update_profile(profile,string)
#        return profile
#    else:
#        return string
#
#def distance_string(p1,p2):
#    """ Distance in string space.  Works for either profiles or strings. """
#    # s1 and s2 can be either strings or profiles
#    if type(p1) == type(''): p1 = string2profile(p1)
#    if type(p2) == type(''): p2 = string2profile(p2)
#    return distance_profiles(p1,p2)

def distance_profiles(p1,p2):
    """ Distance between two profiles """
    if len(p1)!=len(p2): raise "Uncomparable lengths", (p1,p2)

    tot_p1 = float(tools.sum(tools.mget(p1[0],'ACGT')))
    tot_p2 = float(tools.sum(tools.mget(p2[0],'ACGT')))
    
    matches = []
    for pos1,pos2 in map(None,p1,p2):
        match = 0
        for char in 'ACGT':
            match = match + (pos1[char]/tot_p1) * (pos2[char]/tot_p2)
        matches.append(match)
    return tools.sum(matches)/len(matches)
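# Illustrative example: comparing a single-string profile to itself gives the
# maximum per-position overlap of 1.0 (larger values = more similar, despite
# the name).
#
#    p1 = strings2profile(['ACGT'])
#    distance_profiles(p1, p1)    # -> 1.0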

def profile_sum(profiles):
    """ Joining two clusters.  Returns one which contains all points in either. """
    pp(profiles)
    length = len(profiles[0])
    for profile in profiles:
        if len(profile)!=length: raise ValueError, ("Incompatible lengths", map(len, profiles))
    new_p = create_empty_profile(length)
    for i in range(0,length):
        for char in 'ACGT':
            for profile in profiles: 
                new_p[i][char] = new_p[i][char] + profile[i][char]
    return new_p

#def profile_pairwise_sum(p1,p2):
#
#    if len(p1)!=len(p2): raise "Incompatible lengths", (len(p1),len(p2))
#    new_p = create_empty_profile(len(p1))
#    for i in range(0,len(p1)):
#        for char in 'ACGT':
#            new_p[i][char] = p1[i][char] + p2[i][char]
#    return new_p
###########################################

def string_points2cluster(points):
    return [map(lambda p: p[NAME],points),
            strings2profile(map(lambda p: p[VALUE],points)),
            len(points)]

def k_means_string(sequences, n, k):
    points = gather_kmers_names(sequences,n)
    
    clusters = k_means(points, k,
                       distance_func=distance_string_profile,
                       #string_points2cluster,
                       ps2c_func=strings2profile,
                       display_cluster=display_profile)

    clusters = filter(lambda c: c[SIZE], tools.my_sort(clusters,len))
    pp(clusters,2,60)
    print string.join(map(lambda c: display_profile(c[VALUE]),clusters),'\n')
    return clusters

def hierarchical_string(sequences, n):
    points = gather_kmers_names(sequences,n)

    clusters = hierarchical(points, .01,
                            distance_cluster_cluster=distance_profiles,
                            join_clusters=profile_sum,
                            ps2c_func=strings2profile,
                            display_cluster=display_profile)
    return clusters


#def spoint2scluster(spoint):
#    return [[spoint[NAME]],strings2profile(spoint[VALUE]),1]

#def average_profiles(profiles, scaling):
#    return profile_sum(profiles)
#
#def gather_kmers(sequences, k):
#    kmers = []
#    for seq in sequences:
#        for i in range(0,len(seq)-k+1):
#            kmers.append(seq[i:][:k])
#    return kmers

def gather_kmers_names(sequences, k):
    kmers,names = [],[]
    for seq,s in map(None,sequences,range(0,len(sequences))):
        for i in range(0,len(seq)-k+1):
            kmers.append(seq[i:][:k])
            names.append((s,i))
    return map(None,names,kmers)
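# Illustrative example: 3-mers from two sequences; each point is named by its
# (sequence index, offset) pair.
#
#    gather_kmers_names(['ACGT','GGG'], 3)
#    # -> [((0,0),'ACG'), ((0,1),'CGT'), ((1,0),'GGG')]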

############################################################
#
#                DIFFERENT ALGORITHMS
#
############################################################

def hierarchical_optimized(points, max_join,
                           distance_cluster_cluster=euclidian_distance,
                           join_clusters=weighted_average,
                           ps2c_func=points2cluster,
                           display_cluster=pp):

    # first find the max distance between any 2 points
    # (only needed to initialize the min distance at every iteration)

    print "Converting every point to a cluster"
    # first, converts every data point into a cluster
    clusters = map(lambda p,ps2c=ps2c_func: [[p[NAME]],ps2c([p[VALUE]]),1], points)

    print "Calculating all pairwise distances"
    distances = []
    for i in range(0,len(clusters)):
        for j in range(0,len(clusters)):
            if j<=i: continue
            dist = distance_cluster_cluster(clusters[i][VALUE],
                                            clusters[j][VALUE])
            distances.append((i, j, dist))
    print 'Done calculating %s distances'%len(distances)

    distances.sort(lambda d1,d2: cmp(d2[2],d1[2]))
    print 'Done sorting %s distances'%len(distances)

    print "Max distance is %s"%distances[0]

    if max_dist < max_join:
        # if you can't make any joins, then simply return all points,
        # each as a different cluster
        return clusters

    # transform list into dictionary so that indices are never reused
    clusters = tools.list2dic_i(clusters)
    n_clusters = len(clusters)

    # at every iteration, joins the two closest clusters
    # replaces them by the mean cluster
    while 1:
        
        print string.join(map(display_cluster,
                              map(lambda c: c[VALUE],
                                  filter(lambda c: c[SIZE]>1,clusters.values()))),'\n')
        
        # initialize to the max distance, to make sure we don't miss anything
        min_dist, min_pair = max_dist, None
        for i in clusters.keys():
            for j in clusters.keys():
                if j<=i: continue
                dist = distance_cluster_cluster(clusters[i][VALUE], clusters[j][VALUE])
                if dist < min_dist:
                    min_dist = dist
                    min_pair = (i,j)

        #print "Min dist = %s for %s"%(min_dist, min_pair)

        # if the join to be made is greater than the max allowed join,
        # then stop right there
        if (not min_pair) or min_dist > max_join or len(clusters)==1: break
        
        # replace the min pair with the join of the two clusters
        # (for string profiles, profile_sum keeps the per-point counts, so
        # the result is effectively weighted by cluster size)
        i,j = min_pair
        new_mean = join_clusters([clusters[i][VALUE],
                                  clusters[j][VALUE]])
        #new_name = (clusters[i][NAME], clusters[j][NAME])
        new_name = tools.flatten([clusters[i][NAME], clusters[j][NAME]])
        new_size = clusters[i][SIZE] + clusters[j][SIZE]
        clusters[n_clusters] = (new_name, new_mean, new_size)
        n_clusters = n_clusters + 1
        del(clusters[i])
        del(clusters[j])
    cluster_list = tools.dic2list(clusters)
    cluster_list.sort(lambda c1,c2: c2[SIZE]-c1[SIZE])
    return cluster_list

def hierarchical(points, max_join,
                 distance_cluster_cluster=euclidian_distance,
                 join_clusters=weighted_average,
                 ps2c_func=points2cluster,
                 display_cluster=pp):

    # note: ps2c_func receives a *list of point values* and join_clusters a
    # *list of cluster values*; the string-space hooks (strings2profile,
    # profile_sum) match this interface directly

    # first find the max distance between any 2 points
    # (only needed to initialize the min distance at every iteration)

    print "Converting every point to a cluster"
    # first, converts every data point into a cluster
    clusters = map(lambda p,ps2c=ps2c_func: [[p[NAME]],ps2c([p[VALUE]]),1], points)

    print "Calculating all pairwise distances"
    max_dist = 0
    for i in range(0,len(clusters)):
        for j in range(0,len(clusters)):
            if j<=i: continue
            dist = distance_cluster_cluster(clusters[i][VALUE],
                                            clusters[j][VALUE])
            if dist > max_dist: max_dist = dist
    print "Max distance is %s"%max_dist
    if max_dist < max_join:
        # if you can't make any joins, then simply return all points,
        # each as a different cluster
        return clusters

    # transform list into dictionary so that indices are never reused
    clusters = tools.list2dic_i(clusters)
    n_clusters = len(clusters)

    # at every iteration, joins the two closest clusters
    # replaces them by the mean cluster
    while 1:
        
        print string.join(map(display_cluster,
                              map(lambda c: c[VALUE],
                                  filter(lambda c: c[SIZE]>1,clusters.values()))),'\n')
        
        # initialize to the max distance, to make sure we don't miss anything
        min_dist, min_pair = max_dist, None
        for i in clusters.keys():
            for j in clusters.keys():
                if j<=i: continue
                dist = distance_cluster_cluster(clusters[i][VALUE], clusters[j][VALUE])
                if dist < min_dist:
                    min_dist = dist
                    min_pair = (i,j)

        #print "Min dist = %s for %s"%(min_dist, min_pair)

        # if the join to be made is greater than the max allowed join,
        # then stop right there
        if (not min_pair) or min_dist > max_join or len(clusters)==1: break
        
        # replace the min pair with the join of the two clusters
        # (for string profiles, profile_sum keeps the per-point counts, so
        # the result is effectively weighted by cluster size)
        i,j = min_pair
        new_mean = join_clusters([clusters[i][VALUE],
                                  clusters[j][VALUE]])
        #new_name = (clusters[i][NAME], clusters[j][NAME])
        new_name = tools.flatten([clusters[i][NAME], clusters[j][NAME]])
        new_size = clusters[i][SIZE] + clusters[j][SIZE]
        clusters[n_clusters] = (new_name, new_mean, new_size)
        n_clusters = n_clusters + 1
        del(clusters[i])
        del(clusters[j])
    cluster_list = tools.dic2list(clusters)
    cluster_list.sort(lambda c1,c2: c2[SIZE]-c1[SIZE])
    return cluster_list
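# Sketch of a direct call using the string-space hooks (this is what
# hierarchical_string wires up; the sequences below are made-up):
#
#    points = gather_kmers_names(['ACGTACGTAA','TTACGTACGT'], 4)
#    clusters = hierarchical(points, .01,
#                            distance_cluster_cluster=distance_profiles,
#                            join_clusters=profile_sum,
#                            ps2c_func=strings2profile,
#                            display_cluster=display_profile)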

def make_k_means_func(k):
    return lambda points,k=k: k_means(points,k)

def k_means(points, k,
            distance_func=euclidian_distance,
            # ps2c_func maps a *list of point values* to a single center value
            # (e.g. strings2profile); the default takes their unweighted mean
            ps2c_func=lambda values: weighted_average(values,[1]*len(values)),
            display_cluster=pp):

    # create a dictionary mapping point names to values
    point_dic = {}
    tools.mset(point_dic,
               map(lambda p: p[NAME], points),
               map(lambda p: p[VALUE], points))

    # pick k non-identical centers
    if k>len(points): raise ValueError, "More centers than data points! (%s > %s)"%(k,len(points))
    centers = []
    while len(centers) < k: 
        new_center = random.choice(points)
        # check that the new center is not on an old one
        # otherwise the algorithm will not converge
        no_good = 0
        for old_center in centers:
            # distance from point to cluster
            if distance_func(new_center[VALUE], old_center[VALUE])==0:
                no_good = 1
                break
        # if it intersects a previous center, check for another center
        if no_good: continue
        centers.append([[new_center[NAME]],
                        ps2c_func([new_center[VALUE]]),
                        1])

    # now do the iteration
    iter = 0
    while 1:

        print "*"*40

        if 1:
            iter = iter+1
            print "Iteration %s"%iter
            #pp(map(lambda c: (c[SIZE],(map(lambda d: '%s'%d, c[VALUE])),
            #                  string.join(map(lambda s: '%s'%s,c[NAME])[:20],',')), centers),2)
            #pp(centers,1)

        print string.join(map(display_cluster,map(lambda c: c[VALUE],centers)),'\n')
    
        # save the old point assignments
        old_names = map(lambda center: center[NAME], centers)

        # 
        # 1. assign each point to a center
        #

        # first empty the assignments of each center
        for center in centers:
            center[NAME] = []
            center[SIZE] = 0

        # now assign the points
        for point in points:
            # find the best center
            min_dist = distance_func(point[VALUE], centers[0][VALUE])
            closest_center = 0
            for c in range(0,len(centers)):
                dist = distance_func(point[VALUE], centers[c][VALUE])
                if dist < min_dist:
                    min_dist = dist
                    closest_center = c

            # assign the point to that center
            #print 'Assigning %s to %s with score %s'%(
            #    point[VALUE],
            #    display_cluster(centers[closest_center][VALUE]),
            #    min_dist)
            centers[closest_center][NAME].append(point[NAME])
            centers[closest_center][SIZE] = centers[closest_center][SIZE]+1

        #
        # 2. reevaluate each center
        #
        for center in centers:

            # one could turn this into EM by changing the weighting
            # of each point into the probability of that point
            # belonging to that center
            center[VALUE] = ps2c_func(tools.mget(point_dic,center[NAME]))
            
        #
        # to decide if you should stop, evaluate the spread
        # or simply measure how much each center is moving each time
        #

        new_names = map(lambda center: center[NAME], centers)

        additions    = map(lambda new,old: tools.set_subtract(new,old),new_names,old_names)
        subtractions = map(lambda new,old: tools.set_subtract(old,new),new_names,old_names)

        # NOTE: currently stops after a fixed 3 iterations; the commented-out
        # test would instead stop once the assignments have (nearly) converged
        if iter>2: #tools.sum(distances) <= .01*len(distances) or iter > 10:
            break
        else:
            print "Before:       "+string.join(map(lambda add: '%2s'%len(add),old_names))
            print "Subtractions: "+string.join(map(lambda add: '%2s'%len(add),subtractions))
            print "Additions:    "+string.join(map(lambda add: '%2s'%len(add),additions))
            print "After:        "+string.join(map(lambda add: '%2s'%len(add),new_names))

    centers.sort(lambda c1,c2: c2[SIZE]-c1[SIZE])
    return map(lambda c: (c[NAME],c[VALUE],c[SIZE]), centers)
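# Sketch of a call on plain euclidean points (made-up labels and vectors);
# with the default hooks each center value is simply the mean vector of the
# points currently assigned to it.
#
#    pts = [('a',[0.0,0.0]), ('b',[0.2,0.1]), ('c',[5.0,5.0]), ('d',[5.1,4.9])]
#    clusters = k_means(pts, 2, display_cluster=str)
#    # expect one cluster holding 'a','b' and one holding 'c','d'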

############################################################
#
#                NORMALIZATION
#
############################################################

def mean0_stdev1(points):
    new_points = []
    for point in points:

        mean = tools.avg(point[VALUE])
        stdev = tools.stdev(point[VALUE])

        new_value = map(lambda x,m=mean,s=stdev: (x-m)/s, point[VALUE])

        #print "Old mean: %s, old stdev: %s"%(mean,stdev)
        #print "New mean: %s, new stdev: %s"%(tools.avg(new_value),tools.stdev(new_value))
        
        new_point = (point[NAME], new_value)

        new_points.append(new_point)

    return new_points
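# Illustrative example (the exact numbers depend on whether tools.stdev uses
# the population or the sample formula):
#
#    mean0_stdev1([('g1',[1.0,2.0,3.0])])
#    # -> [('g1',[-1.0,0.0,1.0])] with the sample stdev (which is 1 here)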

def restore_points(clusters, old_points):
    # the clustering was done using normalized data points,
    # hence the cluster values do not really correspond to anything
    # meaningful.  However, the names of the points are correct. 
    # 
    # here, we restore the original values of the points, based on
    # their names, and we recompute the cluster value as the average
    # of the unnormalized values the names correspond to 


    # make a lookup table for the original values
    point_dic = {}
    tools.mset(point_dic,
               map(lambda p: p[NAME], old_points),
               map(lambda p: p[VALUE], old_points))

    # construct the new clusters
    new_clusters = []
    for cluster in clusters:
        restored_values = []
        for name in cluster[NAME]:
            restored_values.append(point_dic[name])

        new_cluster = (cluster[NAME],
                       weighted_average(restored_values,
                                        [1]*len(restored_values)),
                       cluster[SIZE])

        new_clusters.append(new_cluster)

    return new_clusters

def normalize_and_cluster(clustering_function, normalization_function, points):

    # normalize points
    new_points = normalization_function(points)

    # compute cluster based on normalized points
    clusters = clustering_function(new_points)

    # recompute the cluster means based on the original (unnormalized) values
    clusters = restore_points(clusters, points)

    return clusters
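# Sketch of the intended use (made-up expression-like vectors): normalize each
# point to mean 0 / stdev 1, cluster the normalized points, then report the
# cluster means back in the original units.
#
#    pts = [('g1',[1.0,2.0,3.0]), ('g2',[10.0,20.0,30.0]), ('g3',[3.0,1.0,2.0])]
#    clusters = normalize_and_cluster(make_k_means_func(2), mean0_stdev1, pts)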

############################################################
#
#  CLUSTERING in ANY SPACE
#
############################################################
        
def follow_references(references, clusters, i, debug=0):
    if debug: sys.stdout.write('Looking for %s'%i)
    while 1:
        if clusters.has_key(i): break
        i = references[i]
        if debug: sys.stdout.write('->%s'%i)
    if debug: sys.stdout.write(' Found!\n')
    return i
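# Illustrative example: cluster 3 was merged into 7, which was merged into 9;
# following the reference chain from 3 lands on the live cluster 9.
#
#    references = {3: 7, 7: 9}
#    clusters   = {9: ['a','b','c']}
#    follow_references(references, clusters, 3)    # -> 9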







## def generic_clustering(points, distances, threshold, points2cluster, cluster2points, compare_clusters, join_clusters, debug=0):

##     # distances is a list:  [(i,j,dist_ij),
##     #                        (i,k,dist_ik),
##     #                        (j,k,dist_jk)]
##     # threshold is the minimum distance threshold
##     # 
##     # doesn't really matter what the points are,
##     # we simply use their indices.

##     if debug: print "Gathering all points indices"
##     assert(len(tools.unique(points))==len(points)) # make sure all points are unique
##     distances = tools.my_sort(distances,lambda d: -d[2]) # sort the distances from best score to worst
##     points_with_distances = tools.unique(tools.flatten(map(lambda d: [d[0],d[1]],distances))) # get the point names from distance table
##     assert(len(tools.set_intersect(points_with_distances,points))==len(points_with_distances)) # make sure all points mentioned are listed
##     if tools.set_subtract(points,points_with_distances): # and see which have no distances whatsoever
##         print "%s points with distances / %s points total (%2.0f%%).\nNo dist info for %s"%(
##             len(points_with_distances),len(points),100.0*len(points_with_distances)/len(points),
##             tools.set_subtract(points,points_with_distances))
##     # this step here is not needed. moreover, i'd like to output clusters in order of input points
##     #points = tools.unique(points_with_distances+points) # then list all points together

##     #print "Points are %s"%points

##     # here's a little trick on actually how we look indices up:
##     # instead of just looking at them in order, we first sort the
##     # index pairs, by the distance that separates them.
##     #
##     # then since, we always look things up in order, we'll end up
##     # joining the shortest motifs together first

##     if debug:
##         print "Making a dictionary for quick distance lookup"
        
##     distdic = {}
##     for i,j,dist in distances:
##         distdic[(min(i,j),max(i,j))] = dist

##     if debug:
##         print "Distance dictionary is: "
##         pp(distdic)
        
##     if debug:
##         print "%s points defined by %s distances"%(len(points),len(distances))
##         print "Distance distribution is: "
##         tools.quick_histogram(tools.cget(distances,2))

##     if debug: print "Making every point its own cluster"
##     references = {}
##     clusters = {}
##     used_names = {}
##     tools.mset(used_names,points,[None]*len(points))
##     maxi,p_i = -1,0
##     while p_i < len(points):
##         maxi = maxi+1
##         if used_names.has_key(maxi): continue
##         # we found an index for the cluster
##         clusters[maxi] = points2cluster([points[p_i]])
##         references[points[p_i]] = maxi
##         p_i = p_i+1
        
##     if debug:
##         print "The clusters are: "
##         pp(clusters,1)

##     #maxi = len(clusters)

##     hierarchy = {}
##     for cluster,elements in clusters.items():
##         assert(len(elements)==1)
##         hierarchy[cluster] = elements[0]

##     #print "Hierarchy is %s"%hierarchy

##     join_score = {}

##     suggestions = {}
##     vetos = {}
##     #references = {}

##     comparison_cache = {}

##     # two ways or sorting the order we visit the pairs
##     # 1) sort items
##     # 2) sort pairs

##     # 1) this is the sorting of the items:  bad
##     #    coz it leads to 1+1,2+1,3+1,4+1,5+1 joinings
##     # keep trying until no more joins are made
##     #indices = tools.my_sort(clusters.keys())
##     #pairs = []
##     #for i in range(0,len(indices)):
##     #    for j in range(i+1,len(indices)):
##     #        pairs.append((indices[i],indices[j]))
##     #pairs = tools.my_sort(pairs,max)

##     # 2) this is the sorting of the pairs
##     # this is better coz it does 1+1,1+1,2+2,1+1,4+2, etc
##     pairs = map(None, tools.cget(distances,0),tools.cget(distances,1))

##     # i will only compare points that had *any* similarity to start with

##     #boom

##     retry_all = 1
##     while retry_all:

##         if debug: print "Re-starting all the loops (%s by %s)"%(len(clusters),len(clusters))
##         if debug: print "Sizes are: %s"%tools.describe_elements(map(len,clusters.values()),lambda size: -size)

##         retry_all = 0

##         for i_tmp,j_tmp in pairs:

##             # probably joined and renamed
##             if clusters.has_key(i_tmp): i = i_tmp
##             else: i = follow_references(references, clusters, i_tmp, debug=debug)
            
##             if clusters.has_key(j_tmp): j = j_tmp
##             else: j = follow_references(references, clusters, j_tmp, debug=debug)


##             if debug: print "Testing Pair (%s,%s) now in clusters (%s,%s)"%(i_tmp,j_tmp,i,j)


##             # it is possible that we've already joined the clusters they now belong in
##             if i==j: continue
            
##             # cluster2cluster_distance

##             #if debug: 
##             #    print "Comparison cache is"
##             #    pp(comparison_cache,1)

##             # PLEASE NOTE!!  IF WE RE-USE CLUSTER NAMES, WE SHOULD FLUSH THE CACHE
##             # AT EVERY ITERATION
##             if comparison_cache.has_key((i,j)):
##                 score = comparison_cache[(i,j)]
##             else:
##                 score = compare2clusters(distdic, clusters[i], clusters[j])
##                 comparison_cache[(i,j)] = score

##             if debug: print "Avg max linkage between %s and %s is %s for %s%% tested and %s for %s tested"%(
##                 i,j,value1,perc1,value2,perc2)

##             if score > threshold:

##                 # now i know i'm joining, increment maxi
##                 maxi = maxi+1

##                 # make a new cluster out of the two joined ones
##                 newcluster = join_clusters(clusters[i],clusters[j])
##                 del(clusters[i])
##                 del(clusters[j])
##                 clusters[maxi] = newcluster

##                 hierarchy[maxi] = (hierarchy[i],hierarchy[j])
##                 join_score[maxi] = weighted_linkage
##                 del(hierarchy[i])
##                 del(hierarchy[j])

##                 references[i] = maxi
##                 references[j] = maxi

##                 #clusters[i] = newcluster
                
##                 retry_all = 1

##     keys = tools.cget(tools.my_sort(clusters.items(), lambda item: min(item[1])),0)
##     #keys = tools.my_sort(clusters.keys())
##     #keys = tools.my_sort(keys,lambda k,join_score=join_score: -(join_score.has_key(k) and join_score[k]))
##     # resorting the keys to put singletons at the end

##     #singletons = tools.lte(keys,len(points))
##     #joined     = tools.gt(keys,len(points))
##     #keys = joined+singletons

##     groups = tools.mget(clusters,keys)
##     hierarchy = tools.mget(hierarchy, keys)

##     #pp(hierarchy,3)
    
##     linkages = []
##     for group in groups: 
##         linkage = []
##         for m1 in range(0,len(group)):
##             for m2 in range(m1+1,len(group)):
##                 if distdic.has_key((group[m1],group[m2])): 
##                     linkage.append(distdic[(group[m1],group[m2])])
##                 else:
##                     linkage.append(0)
##         #linkages.append(tools.avg(linkage))
##         linkages.append(tools.reverse(tools.my_sort(linkage)))

##     #linkages, groups = tools.unpack(tools.reverse(tools.my_sort(map(None,linkages,groups))))
##     #lens, avglink, linkages, groups = tools.unpack(tools.reverse(tools.my_sort(
##     #    map(None,
##     #        map(len,groups),
##     #        map(tools.avg,linkages),
##     #        linkages,
##     #        groups))))
            
    
##     return groups, linkages, hierarchy


def avg_max_linkage(points, distances, threshold, linkage_type='max', debug=0):

    # distances is a list:  [(i,j,dist_ij),
    #                        (i,k,dist_ik),
    #                        (j,k,dist_jk)]
    # threshold is the minimum distance threshold
    # 
    # doesn't really matter what the points are,
    # we simply use their indices.

    if debug: print "Gathering all points indices"
    distances = tools.my_sort(distances,lambda d: -d[2])
    points_with_distances = tools.unique(tools.flatten(map(lambda d: [d[0],d[1]],distances)))
    assert(len(tools.set_intersect(points_with_distances,points))==len(points_with_distances))
    if tools.set_subtract(points,points_with_distances):
        print "%s points with distances / %s points total (%2.0f%%).\nNo dist info for %s"%(
            len(points_with_distances),len(points),100.0*len(points_with_distances)/len(points),
            tools.set_subtract(points,points_with_distances))
    points = tools.unique(points_with_distances+points)

    #print "Points are %s"%points

    # here's a little trick in how we look the indices up:
    # instead of visiting them in input order, we first sort the
    # index pairs by the distance that separates them.
    #
    # then, since we always look things up in that order, we end up
    # joining the closest (most similar) motifs together first

    if debug:
        print "Making a dictionary for quick distance lookup"
        
    distdic = {}
    for i,j,dist in distances:
        distdic[(min(i,j),max(i,j))] = dist

    if debug:
        print "Distance dictionary is: "
        pp(distdic)
        
    if debug:
        print "%s points defined by %s distances"%(len(points),len(distances))
        print "Distance distribution is: "
        tools.quick_histogram(tools.cget(distances,2))

    if debug: print "Making every point its own cluster"
    references = {}
    clusters = {}
    used_names = {}
    tools.mset(used_names,points,[None]*len(points))
    maxi,p_i = -1,0
    while p_i < len(points):
        maxi = maxi+1
        if used_names.has_key(maxi): continue
        # we found an index for the cluster
        clusters[maxi] = [points[p_i]]
        references[points[p_i]] = maxi
        p_i = p_i+1
        
        
    if debug:
        print "The clusters are: "
        pp(clusters,1)

    #maxi = len(clusters)

    hierarchy = {}
    for cluster,elements in clusters.items():
        assert(len(elements)==1)
        hierarchy[cluster] = elements[0]

    #print "Hierarchy is %s"%hierarchy

    join_score = {}

    suggestions = {}
    vetos = {}
    #references = {}

    comparison_cache = {}

    # two ways of sorting the order we visit the pairs
    # 1) sort items
    # 2) sort pairs

    # 1) this is the sorting of the items:  bad
    #    coz it leads to 1+1,2+1,3+1,4+1,5+1 joinings
    # keep trying until no more joins are made
    #indices = tools.my_sort(clusters.keys())
    #pairs = []
    #for i in range(0,len(indices)):
    #    for j in range(i+1,len(indices)):
    #        pairs.append((indices[i],indices[j]))
    #pairs = tools.my_sort(pairs,max)

    # 2) this is the sorting of the pairs
    # this is better coz it does 1+1,1+1,2+2,1+1,4+2, etc
    pairs = map(None, tools.cget(distances,0),tools.cget(distances,1))

    #boom

    retry_all = 1
    while retry_all:

        if debug: print "Re-starting all the loops (%s by %s)"%(len(clusters),len(clusters))
        if debug: print "Sizes are: %s"%tools.describe_elements(map(len,clusters.values()),lambda size: -size)

        retry_all = 0

        for i_tmp,j_tmp in pairs:

            # probably joined and renamed
            if clusters.has_key(i_tmp): i = i_tmp
            else: i = follow_references(references, clusters, i_tmp, debug=debug)
            
            if clusters.has_key(j_tmp): j = j_tmp
            else: j = follow_references(references, clusters, j_tmp, debug=debug)


            if debug: print "Testing Pair (%s,%s) now in clusters (%s,%s)"%(i_tmp,j_tmp,i,j)


            # it is possible that we've already joined the clusters they now belong in
            if i==j: continue
            
            # cluster2cluster_distance

            #if debug: 
            #    print "Comparison cache is"
            #    pp(comparison_cache,1)

            # PLEASE NOTE!!  IF WE RE-USE CLUSTER NAMES, WE SHOULD FLUSH THE CACHE
            # AT EVERY ITERATION
            if comparison_cache.has_key((i,j)):
                value1, perc1, value2, perc2, weighted_linkage = comparison_cache[(i,j)]
            else: 
                value1, perc1 = get_avg_linkage(distdic, clusters[i], clusters[j], linkage_type)
                value2, perc2 = get_avg_linkage(distdic, clusters[j], clusters[i], linkage_type)
                weighted_linkage = tools.weighted_avg([value1,value2],[len(clusters[i]),len(clusters[j])])
                comparison_cache[(i,j)] = (value1,perc1,value2,perc2,weighted_linkage)

            if debug: print "Avg max linkage between %s and %s is %s for %s%% tested and %s for %s tested"%(
                i,j,value1,perc1,value2,perc2)

            if 0: #join_score.has_key(i) and join_score.has_key(j):
                if weighted_linkage < .5*min(join_score[j],join_score[i]):
                    #if 0: print "I veto joining %s to %s.  (%2.0f,%2.0f) -> %2.0f is too big a drop in linkage"%(
                    #    tools.display_list(clusters[i],format='m%s'),
                    #    tools.display_list(clusters[j],format='m%s'),
                    #    join_score[j],join_score[i],weighted_linkage)
                    #suggestions[(i,j)] = weighted_linkage
                    pass
                
            if value1*perc1/100.0 > threshold and value2*perc2/100.0 > threshold:

                #if join_score.has_key(i) and join_score.has_key(j):
                #    if weighted_linkage < .5*min(join_score[j],join_score[i]):
                #        print "I veto joining %s to %s.  (%2.0f,%2.0f) -> %2.0f is too big a drop in linkage"%(
                #            tools.display_list(clusters[i],format='m%s'),
                #            tools.display_list(clusters[j],format='m%s'),
                #            join_score[j],join_score[i],weighted_linkage)
                #        continue
                
                # now i know i'm joining, increment maxi
                maxi = maxi+1


                if debug:

                    if join_score.has_key(i): join_score_i = '%2.0f linkage'%join_score[i]
                    else: join_score_i = 'singleton'
                    if join_score.has_key(j): join_score_j = '%2.0f linkage'%join_score[j]
                    else: join_score_j = 'singleton'
                    
                    if debug: print "Joining clu_%s (%s items, %s) and clu_%s (%s items, %s) -> clu_%s (%s items, %2.0f linkage)"%(
                        i,len(clusters[i]),join_score_i,
                        j,len(clusters[j]),join_score_j,
                        maxi,len(clusters[i])+len(clusters[j]),weighted_linkage)
                
                # make a new cluster out of the two joined ones
                newcluster = clusters[i]+clusters[j]
                del(clusters[i])
                del(clusters[j])
                clusters[maxi] = newcluster

                hierarchy[maxi] = (hierarchy[i],hierarchy[j])
                join_score[maxi] = weighted_linkage
                del(hierarchy[i])
                del(hierarchy[j])

                references[i] = maxi
                references[j] = maxi

                #clusters[i] = newcluster
                
                retry_all = 1

    keys = tools.cget(tools.my_sort(clusters.items(), lambda item: min(item[1])),0)
    #keys = tools.my_sort(clusters.keys())
    #keys = tools.my_sort(keys,lambda k,join_score=join_score: -(join_score.has_key(k) and join_score[k]))
    # resorting the keys to put singletons at the end

    #singletons = tools.lte(keys,len(points))
    #joined     = tools.gt(keys,len(points))
    #keys = joined+singletons

    groups = tools.mget(clusters,keys)
    hierarchy = tools.mget(hierarchy, keys)

    #pp(hierarchy,3)
    
    linkages = []
    for group in groups: 
        linkage = []
        for m1 in range(0,len(group)):
            for m2 in range(m1+1,len(group)):
                if distdic.has_key((group[m1],group[m2])): 
                    linkage.append(distdic[(group[m1],group[m2])])
                else:
                    linkage.append(0)
        #linkages.append(tools.avg(linkage))
        linkages.append(tools.reverse(tools.my_sort(linkage)))

    #linkages, groups = tools.unpack(tools.reverse(tools.my_sort(map(None,linkages,groups))))
    #lens, avglink, linkages, groups = tools.unpack(tools.reverse(tools.my_sort(
    #    map(None,
    #        map(len,groups),
    #        map(tools.avg,linkages),
    #        linkages,
    #        groups))))
            
    
    return groups, linkages, hierarchy
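# Sketch on made-up data: points 1,2 and 3,4 are strongly linked within their
# pairs and only weakly across, so with a threshold of .5 two groups remain.
#
#    points = [1,2,3,4]
#    distances = [(1,2,0.9), (3,4,0.8), (2,3,0.1)]
#    groups, linkages, hierarchy = avg_max_linkage(points, distances, .5)
#    # groups -> [[1,2],[3,4]]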


def get_avg_linkage(distdic, cluster1, cluster2, linkage_type='max'):
    # returns the average linkage (max/min/avg over cluster2, per linkage_type)
    # from each point in cluster1, together with the percentage of cluster1
    # points for which any distance information was found

    max_similarity = []

    # for every point in cluster1
    for i in cluster1:
        
        # find the closest point in cluster2
        #dists_i = tools.mget(distdic, map(lambda j,i=i: (min(i,j),max(i,j)), cluster2), 0)
        dists_i = []
        for j in cluster2:

            key = min(i,j),max(i,j)
            if distdic.has_key(key): dists_i.append(distdic[key])

            

        # and average those distances
        if dists_i:
            if linkage_type=='avg': max_similarity.append(tools.avg(dists_i))
            elif linkage_type=='max': max_similarity.append(max(dists_i))
            elif linkage_type=='min': max_similarity.append(min(dists_i))
            else: raise ValueError, linkage_type
        #if dists_i: max_similarity.append(max(dists_i))

    # then return the avg linkage, as well as the percent of points for which we had values
    return tools.avg(max_similarity), 100.0*len(max_similarity)/len(cluster1)
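# Illustrative example (assuming tools.avg is the arithmetic mean): only the
# (2,3) distance is known, so half the points in [1,2] contribute.
#
#    distdic = {(1,2): 0.9, (2,3): 0.1}
#    get_avg_linkage(distdic, [1,2], [3], 'max')    # -> (0.1, 50.0)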
        
def cluster_clusters(grouping1, grouping2):
    # grouping2 clusters the *groups* of grouping1 (its members are indices
    # into grouping1's group list); flatten the two levels into one grouping

    groups, linkages, hierarchies = grouping1
    ggroups, glinkages, ghierarchies = grouping2

    newgroups, newlinkages, newhierarchies = [], [], []
    for ggroup,glinkage,ghierarchy in map(None, ggroups, glinkages, ghierarchies):

        newhierarchies.append(tools.map_on_hierarchy_safe(ghierarchy, tools.list2dic_i(hierarchies).get))

        newgroups.append(tools.flatten(tools.mget(groups, ggroup)))

        #print "Linkages of the previous groups: %s"%tools.mget(linkages,ggroup)
        #print "Current linkage: %s"%glinkage
        newlinkages.append(tools.flatten(tools.mget(linkages,ggroup))+glinkage)

    return newgroups, newlinkages, newhierarchies
