# Longest distance metric
This metric creates clusters using distance matrices, then flattens the dendrogram onto a line and records the longest distance one can travel on that line without crossing a leaf.

_To create the distance matrices:_
The distance matrices are the last step in the data wrangling process.
The necessary code is in `data_wrangling.jl`.
Begin by applying `make_df_pairs()` to the `df_single` files, followed by `make_df_cor()` and `create_distance_matrices()`.
Please refer to `data_wrangling.md` for clear instructions and to `data_wrangling.jl` for the corresponding Julia code.

In [7]:
# DEPENDENCIES
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.cluster import hierarchy
import seaborn as sns
import os

In [None]:
# HELPER FUNCTIONS

def read_dists(dfile):
    """
    Read a saved distance matrix file into a list of integers.

    Each non-blank line is expected to hold one value followed by a
    two-character suffix (presumably a trailing ".0" -- TODO confirm against
    the code that writes these files). The suffix is dropped and the
    remainder is parsed with `int`.

    `dfile`: path to the file with the saved distances

    Returns the parsed values in file order (an empty list for an empty file).
    Raises `ValueError` if a line is not an integer once the suffix is removed.
    """
    suffix_len = 2
    content_array = []
    with open(dfile) as f:
        for line in f:
            # Remove the line terminator explicitly instead of assuming it is
            # there: a final line without a trailing newline would otherwise
            # lose one of its real digits.
            entry = line.rstrip("\n")
            if not entry.strip():
                # blank lines carry no distance
                continue
            content_array.append(int(entry[:-suffix_len]))
    return content_array

    
def get_params(distmatrix):
    """
    Extract the parameter configuration encoded in a distance matrix filename.

    The last four characters of `distmatrix` (the file extension) are
    discarded and the rest is split on "_" into exactly five fields; the
    first field is ignored.

    Returns the tuple `(popsize, host, memsize, numskills)` where `host` is a
    float and the other three are ints.
    """
    stem = distmatrix[:-4]
    _, raw_popsize, raw_host, raw_memsize, raw_numskills = stem.split("_")
    return int(raw_popsize), float(raw_host), int(raw_memsize), int(raw_numskills)

In [None]:
# MAIN FUNCTION

def main(distpath, distmatrix):
    """
    Compute the longest-segment metric for one distance matrix file.

    The result and the run's parameters are appended to the module-level
    lists `popsize`, `host`, `memsize`, `numskills` and `maxsegment`, which
    must exist before this function is called.

    `distpath`: directory holding the distance matrix (a trailing "/" is
                accepted but not required)
    `distmatrix`: name of file with distance matrix

    Returns None. Nothing is appended when the file contains no distances.
    """

    # get data
    params = get_params(distmatrix)
    dists = read_dists(os.path.join(distpath, distmatrix))

    # nothing to do if there are no distances
    # occurred a few times for very small populations with very small vocabulary
    if len(dists) == 0:
        return None

    # perform clustering
    Z = hierarchy.linkage(dists, method="average")

    # "flatten dendrogram onto line": the third column of the linkage matrix
    # holds the distance at which each merge takes place
    dendline = Z[:, 2]

    # normalise so that the largest merge distance becomes 1
    # (skipped when every merge distance is 0, which would divide by zero;
    # the longest segment is 0.0 in that case either way)
    max_clust_dist = dendline.max()
    if max_clust_dist != 0:
        dendline = dendline / max_clust_dist

    # find longest segment between consecutive merge distances
    gaps = np.diff(dendline)
    ls = max(0.0, float(gaps.max())) if gaps.size else 0.0

    # append to lists
    popsize.append(params[0])
    host.append(params[1])
    memsize.append(params[2])
    numskills.append(params[3])
    maxsegment.append(ls)

In [None]:
# RUN
# Result columns, filled in by main(); the DataFrame is built from them later.
# Note: this takes a while
popsize, host, memsize, numskills, maxsegment = [], [], [], [], []

# root folder holding the distance matrices
distpath = "netlogo_output/dist_matrices/"

# walk the numbered sub-folders 1..30
for i in range(1, 31):
    rundir = distpath + str(i) + "/"
    for file in os.listdir(rundir):
        filename = os.fsdecode(file)
        # only process files named like "v*.csv"
        if filename.startswith("v") and filename.endswith(".csv"):
            main(rundir, file)
    print("Done with {}".format(i))
                

In [None]:
# assemble the collected result lists into one DataFrame, one column per list
column_names = ["popsize", "host", "memsize", "numskills", "maxsegment"]
column_values = [popsize, host, memsize, numskills, maxsegment]
df = pd.DataFrame(dict(zip(column_names, column_values)))
df.head()

In [None]:
# The canonical output file is `df_segments.csv`; a different name is used
# here so that an existing file is not overwritten by accident.
output_file = "df_s.csv"
df.to_csv(output_file)