
import { parse } from "csv-parse";
import ml from 'ml-kmeans';
import mlhc from 'ml-hclust';
import mldc from 'density-clustering';
import fs from 'fs-extra';
import { compress, decompress } from 'compress-json';

// const INPUT_CSV = 'wfgh3.csv';
// const OUTPUT_FOLDER = 'output/wfgh3/';
// const DATE_FORMAT = 'unix';

// const INPUT_CSV = 'oas.commits.metrics.clean.csv';
// const OUTPUT_FOLDER = 'output/oas/';
// const DATE_FORMAT = 'unix';

// const INPUT_CSV = 'oas.commits.metrics.clean.filter5.csv';
// const OUTPUT_FOLDER = 'output/oas5/';
// const DATE_FORMAT = 'date';

// // const INPUT_CSV = 'api_clustering_diff_vector.csv';
// // const OUTPUT_FOLDER = 'output/oas5cluster/';
// // const DATE_FORMAT = 'date';

// const INPUT_CSV = 'commit_file_sizes_vscode.csv';
// const OUTPUT_FOLDER = 'output/vscode/';
// const DATE_FORMAT = 'date';

// Command line: node <script> <dataset> [csv path] [date format]
const [, , INPUT, csvPathArgument, dateFormatArgument] = process.argv;

// An explicit CSV path wins; otherwise the path is derived from the dataset name.
const INPUT_CSV = csvPathArgument || `input/commit_file_sizes_${INPUT}.fast.csv`;

// Per-dataset output directory (the trailing slash is relied on when file names are appended).
const OUTPUT_FOLDER = `output/${INPUT}/`;

// 'unix' makes commit_date go through dayjs.unix(); any other value goes through dayjs().
const DATE_FORMAT = dateFormatArgument || 'date';

// const INPUT_CSV = 'commit_file_sizes_bitcoin.fast.csv';
// const OUTPUT_FOLDER = 'output/bitcoin/';
// const DATE_FORMAT = 'date';

// const INPUT_CSV = 'commit_file_sizes_node.fast.csv';
// const OUTPUT_FOLDER = 'output/node/';
// const DATE_FORMAT = 'date';

// Same dataset folder, but under output_compressed/ (only the first "output/" is replaced).
const OUTPUT_FOLDER_COMPRESSED = OUTPUT_FOLDER.replace("output/", "output_compressed/");

// Make sure both target directories exist before anything is written into them.
[OUTPUT_FOLDER, OUTPUT_FOLDER_COMPRESSED].forEach((folder) => fs.ensureDirSync(folder));

// const INPUT_CSV = '/Volumes/T9/apis/apis.commits.csv';

// Occurrence count per "<repository>:<file_path>" key; filled by cleanup().
let api_spec_ids = {};
// Numeric id -> "<repository>:<file_path>" key; filled by cleanup(), read by sorter() for the *_map.json files.
let reverse_api_spec_ids = [];

/**
 * Filters out rows with unusable commit dates and annotates the remaining
 * rows in place.
 *
 * For every kept row it sets `file_path`, `repository`, a numeric
 * `api_spec_id` (index of the sorted "<repository>:<file_path>" key) and,
 * when the row has a `filename`, `ext` and `short_extension`.
 *
 * Side effects: updates the module-level `api_spec_ids` and
 * `reverse_api_spec_ids`, logs to the console, and writes hist_years.json,
 * extensions.json (only when extensions were found), hist_methods_count.json
 * and hist_extensions.json into both output folders.
 *
 * @param {Object[]} data rows parsed from the CSV
 * @returns {Object[]} the rows that passed the date filter (same objects, mutated)
 */
function cleanup(data) {

    // Commits per calendar year. Counted before the filter decision, so rejected rows are included.
    let years = {};

    data = data.filter(d => {

        const commit_date = (DATE_FORMAT == 'unix')
            ? dayjs.unix(parseInt(d.commit_date, 10))
            : dayjs(d.commit_date);

        const year = commit_date.year();

        years[year] = (years[year] || 0) + 1;

        // An unparseable date has year NaN, and NaN passes both range
        // comparisons, so validity has to be checked explicitly.
        if (!commit_date.isValid() || year >= 2026 || year < 1980) {
            console.log("Bad date", d.commit_date);
            return false;
        }
        return true;
    });

    console.log(years);

    fs.writeJSONSync(OUTPUT_FOLDER + "hist_years.json", years);
    fs.writeJSONSync(OUTPUT_FOLDER_COMPRESSED + "hist_years.json", years);

    //map the repo + file_path to a unique id

    let extensions = {};

    data.forEach(d => {

        // Rows that already carry an api_spec_id use it as the path and take the repository from owner/repo_name.
        if (d.api_spec_id !== undefined) {
            d.file_path = d.api_spec_id;
            d.repository = d.owner + ":" + d.repo_name;
        }

        // Rows with a filename belong to a single repository, so the repository part is left empty.
        if (d.filename !== undefined) {
            d.file_path = d.filename;
            d.repository = "";

            //this one will treat .files as extensions
            let ext = d.file_path.split("/").pop().split(".").pop();
            extensions[ext] = (extensions[ext] || 0) + 1;

            d.ext = ext;
        }

        //PATCH for missing file_path
        if (d.file_path === undefined) {
            d.file_path = d.name.split("_")[1];
        }

        const key = d.repository + ":" + d.file_path;
        api_spec_ids[key] = (api_spec_ids[key] || 0) + 1;
    });

    const extension_names = Object.keys(extensions);

    if (extension_names.length > 0) {

        console.log("Extensions", extensions);

        // Extensions shorter than 5 characters, most frequent first.
        let short_extensions = extension_names.filter(e => e.length < 5).sort((a, b) => extensions[b] - extensions[a]);

        fs.writeJSONSync(OUTPUT_FOLDER + "extensions.json", short_extensions);
        fs.writeJSONSync(OUTPUT_FOLDER_COMPRESSED + "extensions.json", short_extensions);

        // Position lookup built once instead of an indexOf scan per row.
        const extension_index = new Map(short_extensions.map((e, i) => [e, i]));

        data.forEach(d => {
            // Anything that is not a known short extension shares the bucket one past the end of the list.
            d.short_extension = extension_index.has(d.ext)
                ? extension_index.get(d.ext)
                : short_extensions.length;
        });

    }

    // Numeric id = position of the key in sorted order.
    let api_spec_ids_keys = new Map();

    Object.keys(api_spec_ids).sort().forEach((k, i) => {
        api_spec_ids_keys.set(k, i);
        reverse_api_spec_ids[i] = k;
    });

    data.forEach(d => {
        d.api_spec_id = api_spec_ids_keys.get(d.repository + ":" + d.file_path);
    });

    // Histogram of the raw methods_count column, emitted as [value, occurrences] pairs in ascending order.
    let h_methods_count = {};
    data.forEach(d => {
        h_methods_count[d.methods_count] = (h_methods_count[d.methods_count] || 0) + 1;
    });

    let h_sorted_methods_count = Object.keys(h_methods_count).sort((a, b) => a - b).map(k => [parseInt(k, 10), h_methods_count[k]]);

    console.log("Methods count", h_sorted_methods_count);

    fs.writeJSONSync(OUTPUT_FOLDER + "hist_methods_count.json", h_sorted_methods_count);
    fs.writeJSONSync(OUTPUT_FOLDER_COMPRESSED + "hist_methods_count.json", h_sorted_methods_count);

    // Histogram of short_extension indices (a single "undefined" key when no row had a filename).
    let h_extensions = {};
    data.forEach(d => {
        h_extensions[d.short_extension] = (h_extensions[d.short_extension] || 0) + 1;
    });

    console.log("Short extensions", h_extensions);

    fs.writeJSONSync(OUTPUT_FOLDER + "hist_extensions.json", h_extensions);
    fs.writeJSONSync(OUTPUT_FOLDER_COMPRESSED + "hist_extensions.json", h_extensions);

    return data;
}

const parser = parse({ columns: true,
    skip_empty_lines: true}, function (err, data) {

// fetch("apis.csv").then(res=>res.text()).then(text=>{
    // const data = parse(text, {columns: true,
    //     skip_empty_lines: true});

    // A failed parse is reported through `err`; without this check the
    // failure would only surface as a TypeError on `data.length` below,
    // hiding the real cause.
    if (err) {
        throw err;
    }

    console.log("Commits: ", data.length);

    // Drop rows with unusable dates and attach ids / extension info.
    data = cleanup(data);

    // Derived per-row and per-file values shared by every sorter() call below.
    const n = normalize(data);

    /**
     * Builds a distance function over two api_spec_id values.
     *
     * The distance is the Euclidean distance between the two ids' rows of
     * `data`, compared position by position after applying `metric`; the
     * shorter list is padded with `undefined`, so `metric` must accept
     * `undefined`.
     *
     * @param {Object} api_stats not used by this function
     * @param {Function} metric maps a row (or undefined) to a number
     * @param {boolean} [cache_matrix=true] memoise results per ordered "a_b" pair
     * @returns {Function} (a, b) => distance
     */
    function similar(api_stats, metric, cache_matrix = true) {

        const rowsById = {};    // api_spec_id -> matching rows of `data`
        const distances = {};   // "a_b" -> distance already computed

        // Rows of one id, filtered out of `data` on first use and remembered afterwards.
        const rowsFor = (id) => {
            if (rowsById[id] === undefined) {
                rowsById[id] = data.filter(row => row.api_spec_id == id);
            }
            return rowsById[id];
        };

        const without_cache = (a, b) => {
            const rowsA = rowsFor(a);
            const rowsB = rowsFor(b);
            const longest = Math.max(rowsA.length, rowsB.length);

            let sumOfSquares = 0;
            for (let i = 0; i < longest; i++) {
                sumOfSquares += Math.pow(metric(rowsA[i]) - metric(rowsB[i]), 2);
            }

            return Math.sqrt(sumOfSquares);
        };

        const with_cache = (a, b) => {
            const key = a + "_" + b;
            if (distances[key] === undefined) {
                distances[key] = without_cache(a, b);
            }
            return distances[key];
        };

        return cache_matrix ? with_cache : without_cache;
    }

    /**
     * Unfinished experiment: runs density-clustering's KMEANS over the
     * metric values of each series. The clusters are computed but nothing is
     * returned, and `euclideanDistance` is not used.
     *
     * @param {Array[]} arrays one series per entry
     * @param {Function} metric maps an element (or undefined) to a number
     * @param {Function} euclideanDistance unused
     */
    function sortBySimilarityDC(arrays, metric, euclideanDistance) {

        // Length of the longest series, never less than 1.
        const longest = arrays.reduce((max, series) => (series.length > max ? series.length : max), 1);

        // Rule-of-thumb cluster count: sqrt(n / 2), at least 2.
        const clusterCount = Math.max(2, Math.round(Math.sqrt(arrays.length / 2)));

        // Every series becomes a row of equal width; missing or falsy metric values become 0.
        const dataset = arrays.map((series) => {
            const row = [];
            for (let i = 0; i < longest; i++) {
                row.push(metric(series[i]) || 0);
            }
            return row;
        });

        mldc.KMEANS().run(dataset, clusterCount);
    }

    //super slow, does not work
    /**
     * Orders `arrays` by agglomerative clustering over the full pairwise
     * distance matrix. The matrix is also written to matrix.json in
     * OUTPUT_FOLDER.
     *
     * @param {Array} arrays items to order
     * @param {Function} euclideanDistance (a, b) => distance between two items
     * @returns {Array} the items in the order given by the clustering result's indices()
     */
    function sortBySimilarityHC(arrays, euclideanDistance) {

        console.log("sortBySimilarityHC", arrays.length);

        const size = arrays.length;
        const matrix = Array.from({ length: size }, () => []);

        // Each unordered pair is measured once and mirrored across the diagonal.
        for (let row = 0; row < size; row++) {
            matrix[row][row] = 0;
            for (let col = row + 1; col < size; col++) {
                const distance = euclideanDistance(arrays[row], arrays[col]);
                matrix[row][col] = distance;
                matrix[col][row] = distance;
            }
        }

        fs.writeJSONSync(OUTPUT_FOLDER + "matrix.json", matrix);

        const options = {
            isDistanceMatrix: true,
            method: 'complete',
            distance: (a, b) => euclideanDistance(a, b),
        };
        const order = mlhc.agnes(matrix, options).indices();

        console.log(order);

        return order.map((position) => arrays[position]);

    }

    //this one seems to work apart from the need to estimate number of clusters
    /**
     * Orders `arrays` by k-means cluster, and inside each cluster by distance
     * to the cluster's centroid (closest first).
     *
     * @param {Array[]} arrays series to order; shorter ones are zero-padded for clustering only
     * @param {Function} euclideanDistance (item, centroid) => distance
     * @param {?number} [clusterCount=null] number of clusters; estimated as sqrt(n / 2), at least 2, when falsy
     * @returns {Array[]} the original items, cluster after cluster
     */
    function sortBySimilarity(arrays, euclideanDistance, clusterCount = null) {

        const total = arrays.length;

        // Width of the padded rows: the longest series, never less than 1.
        let width = 1;
        for (const series of arrays) {
            if (series.length > width) {
                width = series.length;
            }
        }

        if (!clusterCount) {
            clusterCount = Math.max(2, Math.round(Math.sqrt(total / 2)));
        }

        // Equal-width copies for k-means; missing or falsy entries become 0.
        const padded = arrays.map((series) => {
            const row = [];
            for (let i = 0; i < width; i++) {
                row.push(series[i] || 0);
            }
            return row;
        });

        const kmeans = ml.kmeans(padded, clusterCount);

        // Bucket the original (unpadded) items by their assigned cluster.
        const clusters = Array.from({ length: clusterCount }, () => []);
        kmeans.clusters.forEach((clusterIndex, i) => {
            clusters[clusterIndex].push(arrays[i]);
        });

        const byDistanceTo = (center) => (a, b) =>
            euclideanDistance(a, center) - euclideanDistance(b, center);

        return clusters.flatMap((members, i) =>
            members.sort(byDistanceTo(kmeans.centroids[i].centroid))
        );
    }

    /**
     * Greedy nearest-neighbour ordering: starts with the first item and
     * repeatedly appends whichever remaining item is closest to the one just
     * placed. Quadratic in the number of items.
     *
     * @param {Array} arrays items to order (not modified)
     * @param {Function} euclideanDistance (a, b) => distance between two items
     * @returns {Array} a new array with the same items in chained order; empty for empty input
     */
    function sortBySimilaritySlow(arrays, euclideanDistance) {

        // Without this guard an empty input produced [undefined].
        if (arrays.length === 0) {
            return [];
        }

        const remaining = [...arrays];

        // Start with the first array
        const sorted = [remaining.shift()];

        while (remaining.length > 0) {
            const last = sorted[sorted.length - 1];
            let minDist = Infinity;
            // Stays -1 when no distance is below Infinity (e.g. all NaN);
            // splice(-1, 1) then takes the last remaining item, so the loop still ends.
            let closestIndex = -1;

            remaining.forEach((item, index) => {
                const dist = euclideanDistance(last, item);
                if (dist < minDist) {
                    minDist = dist;
                    closestIndex = index;
                }
            });

            sorted.push(remaining.splice(closestIndex, 1)[0]);
        }

        return sorted;
    }

    sorter(n, "points_delta_avg.json", (apis, api_stats)=>{
        return apis.sort((a,b)=> (api_stats[a].delta_methods_count_sum/api_stats[a].count) - api_stats[b].delta_methods_count_sum/api_stats[b].count);
    });

    return;

    // methods_count_last descending; ties by delta_class_number ascending, then last_commit ascending.
    sorter(n, "points_double_delta2.json", (apis, api_stats) => {

        const compare = (a, b) => {
            const left = api_stats[a];
            const right = api_stats[b];

            const byLastCount = -(left.methods_count_last - right.methods_count_last);
            if (byLastCount != 0) {
                return byLastCount;
            }

            const byClassMask = left.delta_class_number - right.delta_class_number;
            if (byClassMask != 0) {
                return byClassMask;
            }

            return left.last_commit - right.last_commit;
        };

        return apis.sort(compare);

    });

    // methods_count_last ascending; ties by delta_class_number descending, then last_commit ascending.
    sorter(n, "points_double2_delta2.json", (apis, api_stats) => {

        return apis.sort((a, b) => {
            let de = -(api_stats[b].methods_count_last - api_stats[a].methods_count_last);

            if (de == 0) {
                // This used to read `delta_class_number2`, a property nothing
                // assigns: the difference was always NaN, which disabled this
                // tie-break and the one below.
                de = api_stats[b].delta_class_number - api_stats[a].delta_class_number;

                if (de == 0) {
                    de = api_stats[a].last_commit - api_stats[b].last_commit;
                }
            }

            return de;
        });

    });

    // Ascending by the number that encodes which delta classes a file has shown.
    sorter(n, "points_delta_class_number.json", (apis, api_stats) => {
        const byClassMask = (a, b) => api_stats[a].delta_class_number - api_stats[b].delta_class_number;
        return apis.sort(byClassMask);
    });

    // Ascending by the summed methods_count deltas.
    sorter(n, "points_delta_sum.json", (apis, api_stats) => {
        const byDeltaSum = (a, b) => api_stats[a].delta_methods_count_sum - api_stats[b].delta_methods_count_sum;
        return apis.sort(byDeltaSum);
    });

    // delta_class_number, then delta_methods_count, then methods_count_last; all ascending.
    sorter(n, "points_double_delta.json", (apis, api_stats) => {

        const compare = (a, b) => {
            const left = api_stats[a];
            const right = api_stats[b];

            const byClassMask = left.delta_class_number - right.delta_class_number;
            if (byClassMask != 0) {
                return byClassMask;
            }

            const byDelta = left.delta_methods_count - right.delta_methods_count;
            if (byDelta != 0) {
                return byDelta;
            }

            return left.methods_count_last - right.methods_count_last;
        };

        return apis.sort(compare);

    });

    // Extension bucket first, then path (locale order) inside each bucket.
    sorter(n, "points_ext_path.json", (apis, api_stats) => {

        const compare = (a, b) => {
            const byExtension = api_stats[a].short_extension - api_stats[b].short_extension;
            if (byExtension != 0) {
                return byExtension;
            }
            return api_stats[a].file_path.localeCompare(api_stats[b].file_path);
        };

        return apis.sort(compare);

    });

    // Path only, locale order.
    sorter(n, "points_file_path.json", (apis, api_stats) => {
        const byPath = (a, b) => api_stats[a].file_path.localeCompare(api_stats[b].file_path);
        return apis.sort(byPath);
    });


    // Extension bucket only.
    sorter(n, "points_short_extension.json", (apis, api_stats) => {
        const byExtension = (a, b) => api_stats[a].short_extension - api_stats[b].short_extension;
        return apis.sort(byExtension);
    });




    // sorter(n, "points_commit_similar_hc_date.json", (apis, api_stats)=>{
    //     console.log("Sorting by commit date with HC similarity");
    //     return sortBySimilarityHC(apis, similar(api_stats, (d)=> (d === undefined) ? 0 : d.commit_date.unix()), false);
    // });



    // sorter(n, "points_methods_similar_hc_count.json", (apis, api_stats)=>{
    //     return sortBySimilarityHC(apis, similar(api_stats, (d)=> (d === undefined) ? 0 : d.methods_count || 0));
    //     // return apis.sort(similar(api_stats, (d)=>(d === undefined) ? 0 : d.methods_count || 0));
    // });
    
    // k-means based orderings: one output per row value that is compared
    // position by position between two files' commit lists.
    const similarityOrderings = [
        ["points_methods_similar_count.json", (d) => d.methods_count || 0],
        ["points_commit_similar_date.json", (d) => d.commit_date.unix()],
        ["points_commit_similar_age.json", (d) => d.commit_age],
        ["points_commit_similar_normalage.json", (d) => d.normalized_commit_age],
    ];

    for (const [file, valueOfRow] of similarityOrderings) {
        sorter(n, file, (apis, api_stats) => {
            // Positions past the end of the shorter list arrive as undefined and count as 0.
            const metric = (d) => (d === undefined) ? 0 : valueOfRow(d);
            return sortBySimilarity(apis, similar(api_stats, metric));
        });
    }

    // return;

    /**
     * Comparator factory: orders ids by delta_methods_count, then
     * methods_count_first, then methods_count_last (all ascending).
     *
     * @param {Object} api_stats per-id statistics
     * @returns {Function} (a, b) => number, usable with Array.prototype.sort
     */
    function delta_methods_count_wave2(api_stats) {
        const fields = ["delta_methods_count", "methods_count_first", "methods_count_last"];

        return (a, b) => {
            let difference = 0;

            // The first field that tells the two ids apart decides; the last field decides unconditionally.
            for (const field of fields) {
                difference = api_stats[a][field] - api_stats[b][field];
                if (difference != 0) {
                    break;
                }
            }

            return difference;
        };
    }

    // Ordering produced by the delta_methods_count_wave2 comparator.
    sorter(n, "points_delta_methods_wave2_count.json", (apis, api_stats) =>
        apis.sort(delta_methods_count_wave2(api_stats)));



    // Ascending by the midpoint between a file's first and last commit.
    sorter(n, "points_mid.json", (apis, api_stats) => {
        const byMidpoint = (a, b) => api_stats[a].mid_commit - api_stats[b].mid_commit;
        return apis.sort(byMidpoint);
    });

    /**
     * Comparator factory: orders ids by methods_count_first, then
     * delta_methods_count, then methods_count_last (all ascending).
     *
     * @param {Object} api_stats per-id statistics
     * @returns {Function} (a, b) => number, usable with Array.prototype.sort
     */
    function delta_methods_count_wave(api_stats) {
        return (a, b) => {
            const left = api_stats[a];
            const right = api_stats[b];

            const byFirst = left.methods_count_first - right.methods_count_first;
            if (byFirst != 0) {
                return byFirst;
            }

            const byDelta = left.delta_methods_count - right.delta_methods_count;
            if (byDelta != 0) {
                return byDelta;
            }

            return left.methods_count_last - right.methods_count_last;
        };
    }

    // Ordering produced by the delta_methods_count_wave comparator.
    sorter(n, "points_delta_methods_wave_count.json", (apis, api_stats) =>
        apis.sort(delta_methods_count_wave(api_stats)));

    // Ascending by average methods_count per commit.
    sorter(n, "points_methods_count_avg.json", (apis, api_stats) => {
        const average = (id) => api_stats[id].methods_count / api_stats[id].count;
        return apis.sort((a, b) => average(a) - average(b));
    });

    /**
     * Comparator factory: orders ids by methods_count_first, then
     * methods_count_last, then average methods_count per commit (all ascending).
     *
     * @param {Object} api_stats per-id statistics
     * @returns {Function} (a, b) => number, usable with Array.prototype.sort
     */
    function methods_count_wave(api_stats) {
        const average = (entry) => entry.methods_count / entry.count;

        return (a, b) => {
            const left = api_stats[a];
            const right = api_stats[b];

            const byFirst = left.methods_count_first - right.methods_count_first;
            if (byFirst != 0) {
                return byFirst;
            }

            const byLast = left.methods_count_last - right.methods_count_last;
            if (byLast != 0) {
                return byLast;
            }

            return average(left) - average(right);
        };
    }
    

    // Ordering produced by the methods_count_wave comparator.
    sorter(n, "points_methods_wave_count.json", (apis, api_stats) =>
        apis.sort(methods_count_wave(api_stats)));

    // The remaining outputs here are plain ascending sorts on one api_stats field each.
    const singleFieldOrderings = [
        ["points_index.json", "i"],
        ["points_methods_count_first.json", "methods_count_first"],
        ["points_methods_count_last.json", "methods_count_last"],
        ["points_first.json", "first_commit"],
        ["points_last.json", "last_commit"],
        ["points_count.json", "count"],
        ["points_methods_count.json", "methods_count"],
        ["points_ndatesum.json", "ndatesum"],
    ];

    for (const [file, field] of singleFieldOrderings) {
        sorter(n, file, (apis, api_stats) =>
            apis.sort((a, b) => api_stats[a][field] - api_stats[b][field]));
    }

    /**
     * Comparator factory: orders ids by commit count, then last_commit, then
     * first_commit (all ascending).
     *
     * @param {Object} api_stats per-id statistics
     * @returns {Function} (a, b) => number, usable with Array.prototype.sort
     */
    function wave(api_stats) {
        return (a, b) => {
            const left = api_stats[a];
            const right = api_stats[b];

            const byCount = left.count - right.count;
            if (byCount != 0) {
                return byCount;
            }

            const byLast = left.last_commit - right.last_commit;
            return (byLast == 0) ? left.first_commit - right.first_commit : byLast;
        };
    }

    // Ordering produced by the wave comparator (count, last commit, first commit).
    sorter(n, "points_wave.json", (apis, api_stats) => apis.sort(wave(api_stats)));

    // [1,2,3,4,5,10,20].forEach(MIN_COMMITS=>{

    //     sorter(n, `points_wave_filtered_${MIN_COMMITS}.json`, (apis, api_stats)=>{
    //         return apis.sort(wave(api_stats));
    //     }, MIN_COMMITS);

    // });

    // Ascending by the time span between a file's first and last commit.
    sorter(n, "points_age.json", (apis, api_stats) => {
        const byAge = (a, b) => api_stats[a].age - api_stats[b].age;
        return apis.sort(byAge);
    });

    // [1,5,10,20].forEach(MIN_COMMITS=>{

    //     sorter(n, `points_age_filtered_${MIN_COMMITS}.json`, (apis, api_stats)=>{
    //         return apis.sort((a,b)=>api_stats[a].age - api_stats[b].age);
    //     }, MIN_COMMITS);

    // });




    // console.log(points.length);

    // fs.writeJSONSync('points.json', points);
});

// Stream the input CSV from disk into the parser defined above; its callback receives the parsed rows.
fs.createReadStream(INPUT_CSV).pipe(parser);

import dayjs from 'dayjs';
import minMax from 'dayjs/plugin/minMax.js';

dayjs.extend(minMax);

/**
 * Derives the per-row and per-file values that the sorters consume.
 *
 * Rows are annotated in place with `commit_date` (replaced by its dayjs
 * object), `commit_age`, `commit_age_reverse`, `commit_age_middle`,
 * `normalized_commit_age`, `delta_methods_count` and
 * `delta_methods_count_class`.
 *
 * Side effects: logs to the console, writes stats.json into both output
 * folders and hist_delta_methods_counts.json into the compressed folder.
 *
 * @param {Object[]} data rows returned by cleanup(); must not be empty
 * @returns {{data: Object[], apis: number[], api_stats: Object, ndates: number[], dates: Object[]}}
 *          the annotated rows, the distinct ids, statistics keyed by id, each row's
 *          date as a 0..1 fraction of the overall range, and each row's dayjs date
 * @throws {Error} when `data` is empty
 */
function normalize(data) {

    // Everything below (min/max date, ranges, per-file stats) needs at least one row.
    if (data.length === 0) {
        throw new Error("normalize: no rows to process");
    }

    const parseDate = (DATE_FORMAT == 'unix')
        ? (d) => dayjs.unix(d.commit_date)   //apis
        : (d) => dayjs(d.commit_date);       //github

    const dates = data.map((d) => parseDate(d));

    const minDate = dayjs.min(dates);
    const maxDate = dayjs.max(dates);

    console.log("Dates:", minDate, maxDate);

    // Overall time span in (fractional) days.
    const range = maxDate.diff(minDate, 'day', true);

    // Position of every commit inside the overall span. When all commits
    // share one timestamp the span is 0; use 0 instead of 0/0 = NaN, the same
    // way normalized_commit_age is handled below.
    const ndates = dates.map((date) => (range === 0) ? 0 : date.diff(minDate, 'day', true) / range);

    console.log("Date range:", range);

    const distinct_dates = new Set(dates.map(d => d.format('YYYY-MM-DD HH:mm:ss')));

    const apis = Array.from(new Set(data.map(d => parseInt(d.api_spec_id, 10))));

    console.log("APIs", apis.length);

    const stats = {
        api_count: apis.length,
        commit_count: data.length, //events
        min_date: minDate,
        max_date: maxDate,
        range_date: range,
        distinct_dates: distinct_dates.size
    };

    fs.writeJSONSync(OUTPUT_FOLDER + "stats.json", stats);
    fs.writeJSONSync(OUTPUT_FOLDER_COMPRESSED + "stats.json", stats);

    const methodsCountOf = (d) => parseInt(d.methods_count || "0", 10);

    // Per-file accumulators, keyed by api_spec_id.
    const api_stats = {};

    data.forEach((d, i) => {

        const entry = api_stats[d.api_spec_id];

        if (entry === undefined) {
            const [owner, repo_name] = d.repository.split(":");
            api_stats[d.api_spec_id] = {
                first_commit: dates[i],
                last_commit: dates[i],
                count: 1,
                ndatesum: ndates[i],
                methods_count: methodsCountOf(d),
                //assume these are all the same for every row of one file
                short_extension: d.short_extension,
                owner: owner,
                repo_name: repo_name,
                file_path: d.file_path
            };
        } else {
            entry.first_commit = dayjs.min([entry.first_commit, dates[i]]);
            entry.last_commit = dayjs.max([entry.last_commit, dates[i]]);
            entry.count++;
            entry.ndatesum += ndates[i];
            entry.methods_count += methodsCountOf(d);
        }

    });

    Object.keys(api_stats).forEach((k, i) => {

        const entry = api_stats[k];
        entry.age = entry.last_commit.diff(entry.first_commit, 'day', true);
        entry.mid_commit = entry.first_commit.add(entry.age / 2, "day");
        entry.i = i;

    });

    data.forEach((d, i) => {

        const entry = api_stats[d.api_spec_id];
        const when = dates[i];

        d.commit_date = when;

        d.commit_age = when.diff(entry.first_commit, 'day', true);
        d.commit_age_reverse = -when.diff(entry.last_commit, 'day', true);
        //commit age in the middle between first and last commit
        d.commit_age_middle = when.diff(entry.mid_commit, 'day', true);

        d.normalized_commit_age = (entry.age == 0) ? 0 : d.commit_age / entry.age;

        // Identity comparison: this matches only the exact dayjs object that was
        // stored as first/last commit — presumably dayjs.min/max hand back one of
        // the objects they were given; confirm against the minMax plugin.
        if (when === entry.first_commit) {
            entry.methods_count_first = methodsCountOf(d);
        }

        if (when === entry.last_commit) {
            entry.methods_count_last = methodsCountOf(d);
        }

    });

    Object.keys(api_stats).forEach(k => {

        api_stats[k].delta_methods_count = api_stats[k].methods_count_last - api_stats[k].methods_count_first;

    });

    // every API has methods_count along their history
    const methods_counts = {};

    data.forEach((d, i) => {

        methods_counts[d.api_spec_id] = methods_counts[d.api_spec_id] || [];
        methods_counts[d.api_spec_id].push({ d, commit_date: dates[i], methods_count: methodsCountOf(d) });

    });

    const hist_delta_methods_counts = {};

    // Changes of at most this size in either direction fall into the middle class.
    const DELTA_CLASS_TINY = 16;

    Object.keys(methods_counts).forEach(k => {

        const commits = methods_counts[k];

        //make sure the commits are sorted by date
        commits.sort((a, b) => a.commit_date.diff(b.commit_date));

        //scan the history and compute the delta against the previous commit (0 for the first one)
        let prev = commits[0].methods_count;

        commits.forEach((mc) => {

            const result = mc.methods_count - prev;

            prev = mc.methods_count;

            mc.d.delta_methods_count = result;

            const entry = api_stats[mc.d.api_spec_id];
            entry.delta_methods_count_sum = (entry.delta_methods_count_sum || 0) + result;

            // 0 = shrank by more than TINY, 2 = grew by more than TINY, 1 = everything in between.
            if (result < -DELTA_CLASS_TINY) {
                mc.d.delta_methods_count_class = 0;
            } else if (result > DELTA_CLASS_TINY) {
                mc.d.delta_methods_count_class = 2;
            } else {
                mc.d.delta_methods_count_class = 1;
            }

            const cls = mc.d.delta_methods_count_class;
            hist_delta_methods_counts[cls] = (hist_delta_methods_counts[cls] || 0) + 1;

        });

    });

    console.log("Delta methods counts", hist_delta_methods_counts);

    // fs.writeJSONSync(OUTPUT_FOLDER + "hist_delta_methods_counts.json", hist_delta_methods_counts);
    fs.writeJSONSync(OUTPUT_FOLDER_COMPRESSED + "hist_delta_methods_counts.json", hist_delta_methods_counts);

    // How many commits of each class every file has. The array has 9 slots,
    // but the classification above only ever produces indices 0-2.
    data.forEach(d => {

        const entry = api_stats[d.api_spec_id];
        entry.delta_class = entry.delta_class || Array(9).fill(0);
        entry.delta_class[d.delta_methods_count_class]++;

    });

    console.log("Encoding");

    //encode the delta_class as a number precisely: bit i is set when class i occurred at least once
    Object.keys(api_stats).forEach(k => {

        api_stats[k].delta_class_number = 0;
        api_stats[k].delta_class.forEach((c, i) => {
            if (c > 0) {
                api_stats[k].delta_class_number += Math.pow(2, i);
            }
        });

    });

    return { data, apis, api_stats, ndates, dates };

}

// metadata.json does not depend on the ordering, so only the first sorter() call writes it.
let METADATA_WRITTEN = false;

/**
 * Applies one ordering to the file ids and writes the resulting point sets.
 *
 * All files go to OUTPUT_FOLDER_COMPRESSED, passed through compress():
 * `output` (x = rank of the row's file, y = normalised date) plus the
 * `_map`, `_t`, `_normtime`, `_normage`, `_normage_reverse` and
 * `_normage_center` variants, and metadata.json on the first call only.
 *
 * @param {Object} n result of normalize(): { data, apis, api_stats, ndates, dates }
 * @param {string} output file name ending in ".json"; variant names are derived from it
 * @param {Function} asort (apis, api_stats) => ordered ids; receives a copy of n.apis
 * @param {number} [MIN_COMMITS] when given, rows of files with at most this many
 *        commits get x = 0 and the other ranks are rescaled
 */
function sorter(n, output, asort, MIN_COMMITS) {

    const { data, api_stats, ndates, dates } = n;

    function write(filename, points) {
        // fs.writeJSONSync(OUTPUT_FOLDER + filename, points);
        fs.writeJSONSync(OUTPUT_FOLDER_COMPRESSED + filename, compress(points));
    }

    // The comparator sorts in place; hand it a copy so n.apis keeps its order.
    const apis = asort([...n.apis], api_stats);

    // Rank of every id, built once. This replaces an indexOf scan per data row
    // and gives the same answers: first occurrence wins, -1 when absent, and
    // NaN is never found.
    const rankById = new Map();
    apis.forEach((id, index) => {
        if (!Number.isNaN(id) && !rankById.has(id)) {
            rankById.set(id, index);
        }
    });
    const rankOf = (id) => (rankById.has(id) ? rankById.get(id) : -1);

    let napis;

    if (MIN_COMMITS === undefined) {

        napis = data.map(d => rankOf(parseInt(d.api_spec_id, 10)) / apis.length);

    } else {

        const filtered = apis.filter(a => api_stats[a].count <= MIN_COMMITS).length;

        //flatten out the apis with at most MIN_COMMITS commits
        napis = data.map(d => {

            if (api_stats[d.api_spec_id].count <= MIN_COMMITS) {
                return 0;
            }
            return (rankOf(parseInt(d.api_spec_id, 10)) - filtered) / (apis.length - filtered);

        });

    }

    console.log("Napis", napis.length);

    let points = data.map((d, i) => [napis[i], ndates[i]]);

    write(output, points);

    // For every id: its normalised rank under this ordering and its "<repository>:<file_path>" key.
    const map = [];
    reverse_api_spec_ids.forEach((k, i) => {
        map[i] = [rankOf(i) / apis.length, k];

        //TODO what if filtered?
    });

    write(output.replace(".json", "_map.json"), map);

    write(output.replace(".json", "_t.json"), dates.map(d => d.format('DD/MM/YYYY HH:mm:ss')));

    console.log(output, points.length);

    points = data.map((d, i) => [napis[i], d.normalized_commit_age]);

    write(output.replace(".json", "_normtime.json"), points);

    const max_commit_age = data.map(d => d.commit_age).reduce((a, b) => Math.max(a, b), 0);

    points = data.map((d, i) => [napis[i], d.commit_age / max_commit_age]);

    write(output.replace(".json", "_normage.json"), points);

    const max_commit_age_reverse = data.map(d => d.commit_age_reverse).reduce((a, b) => Math.max(a, b), 0);

    points = data.map((d, i) => [napis[i], (max_commit_age_reverse - d.commit_age_reverse) / max_commit_age_reverse]);

    write(output.replace(".json", "_normage_reverse.json"), points);

    //center the points Y
    const max_commit_age_middle = data.map(d => d.commit_age_middle).reduce((a, b) => Math.max(a, b), 0);
    const min_commit_age_middle = data.map(d => d.commit_age_middle).reduce((a, b) => Math.min(a, b), 0);
    const range_commit_age_middle = max_commit_age_middle - min_commit_age_middle;

    console.log("Middle", min_commit_age_middle, max_commit_age_middle, range_commit_age_middle);

    points = data.map((d, i) => [napis[i], (d.commit_age_middle - min_commit_age_middle) / range_commit_age_middle]);

    write(output.replace(".json", "_normage_center.json"), points);

    if (METADATA_WRITTEN) {
        return;
    }
    METADATA_WRITTEN = true;

    // Per-row attributes: [methods_count, commit year, extension bucket, delta class].
    const metadata = data.map((d, i) => [parseInt(d.methods_count, 10), dates[i].year(), d.short_extension, d.delta_methods_count_class]);

    write("metadata.json", metadata);

}