/*
Type and hapax accumulation curves
Copyright (C) 2007  Jukka Suomela

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA

To contact the author, email jukka.suomela@cs.helsinki.fi
or see http://www.cs.helsinki.fi/jukka.suomela/ for further
details. See http://www.cs.helsinki.fi/jukka.suomela/types/ for
more information on this program.
*/

/**
    \mainpage Type and hapax accumulation curves

    A program for computing accumulation curves of types and hapaxes. Uses permutation testing: constructs a number of random permutations of the input, and finds the upper and lower bounds for each significance level. For more information on the program, see the <a href="http://www.cs.helsinki.fi/jukka.suomela/types/">web site</a>.

    Copyright (C) 2007 <a href="http://www.cs.helsinki.fi/jukka.suomela/">Jukka Suomela</a>. This program comes with ABSOLUTELY NO WARRANTY. This is free software, and you are welcome to redistribute it under the terms of the <a href="http://www.gnu.org/licenses/gpl.html">GNU General Public License</a>.

    \file types.c
    \brief Source code.
*/

// #define NDEBUG
#include <assert.h>
#include <errno.h>
#include <limits.h>
#include <math.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/// \addtogroup version Version information
/// @{

/// Version number.
/** A string literal that is spliced into the banner printed by #version. */
#define VERSION "2007-05-15"

/// Year for the copyright.
/** A string literal that is spliced into the copyright line printed by #version. */
#define YEAR "2007"

/// @}
/// \addtogroup configurable Configurable part
/// @{

/// The significance levels to report.
/** A comma-separated list of decimal numbers in the range from 0.0 to 0.5. Used to initialise #LEVELS. The list is also stringified (via #STRINGIFY) into the help text printed by #usage, so the documentation always shows the compiled-in levels. */
#define LEVELS_LIST 0.0001, 0.001, 0.01, 0.05, 0.10

/// @}
/// \addtogroup performance Performance tuning
/// These settings can be changed to tune the performance on different platforms.
/// @{

#ifndef WORD_BITS
/// The size of the words that are used to store the bit vectors.
/** Possible values: 32 and 64; any other value is rejected at compile time with an \c \#error. The \c \#ifndef guard allows the value to be predefined, e.g. on the compiler command line. */
#  define WORD_BITS 32
#endif

#ifndef BITCOUNT_BITS
/// The size of the precomputed arrays of bit counts.
/** Possible values: 8, 11 or 16. Each word is divided into fragments of length \c BITCOUNT_BITS. An array is used to look up the number of ones in each fragment. The largest value is not necessarily the best, as smaller arrays may be more cache-friendly; on Pentium 4 platform, the value 11 seems to be the best choice. The \c \#ifndef guard allows the value to be predefined. */
#  define BITCOUNT_BITS 11
#endif

/// The type which is used to store the bit counts.
/** A single table entry is at most #BITCOUNT_BITS and the sum over a whole word is at most #WORD_BITS, therefore any unsigned type is ok. */
typedef unsigned bitcount_t;

/// @}
/// \addtogroup bitmasks Bit masks
/// @{

#if WORD_BITS == 32
/// The type which is used to store the bit vectors.
/** Selected by #WORD_BITS. Only the #WORD_BITS low-order bits of a word are addressed by #get_lsb_bit. */
typedef uint_fast32_t word_t;
/// Value 1 in the type #word_t.
#  define WORD_ONE UINT32_C(1)
/// Base-2 logarithm of #WORD_BITS
#  define MSB_SHIFT 5
#elif WORD_BITS == 64
// 64-bit variants of the three definitions above.
typedef uint_fast64_t word_t;
#  define WORD_ONE UINT64_C(1)
#  define MSB_SHIFT 6
#else
// Any other word size is a configuration error.
#  error "Unsupported WORD_BITS"
#endif

/// A bitmask for extracting the least significant bits of an index.
/** Has the #MSB_SHIFT low-order bits set, i.e. its value is #WORD_BITS - 1. */
#define LSB_MASK ((WORD_ONE << MSB_SHIFT) - WORD_ONE)

/// Find the word that stores the boolean value with index \a i.
/** Discards the #MSB_SHIFT low-order bits of \a i; what remains is the position of the word inside a bit vector. The bit inside that word is given by #get_lsb_bit. */
inline static size_t
get_msb_index(size_t i)
{
    const size_t word_index = i >> MSB_SHIFT;
    return word_index;
}

/// Build the single-bit mask for the boolean value with index \a i.
/** Keeps only the low-order bits of \a i selected by #LSB_MASK and returns a word with exactly that bit set. The word to apply the mask to is given by #get_msb_index. */
inline static word_t
get_lsb_bit(size_t i)
{
    const size_t bit_position = i & LSB_MASK;
    return WORD_ONE << bit_position;
}

/// @}
/// \addtogroup utils Utilities
/// General-purpose macros.
/// @{

/// Used to implement #STRINGIFY.
/** Turns the argument into a string literal without expanding it first. */
#define STRINGIFY2(x) #x
/// Stringify \a x.
/** The extra level of indirection makes sure that \a x is macro-expanded before it is turned into a string literal. */
#define STRINGIFY(x) STRINGIFY2(x)

/// Minimum of \a x and \a y.
/** Note: each argument may be evaluated twice; do not pass expressions with side effects. */
#define MIN(x, y) ((x) < (y) ? (x) : (y))
/// Maximum of \a x and \a y.
/** Note: each argument may be evaluated twice; do not pass expressions with side effects. */
#define MAX(x, y) ((x) > (y) ? (x) : (y))

/// @}
/// \addtogroup malloc Memory allocation
/// Memory allocation and \c size_t arithmetics.
/// @{

/// Allocate \a count elements of type \a type and store the pointer to \a target.
/** The byte count is computed with #size_multiply and the memory is obtained with #mymalloc, so both an overflow and an out-of-memory condition terminate the program. The elements are left uninitialised. */
#define MYMALLOC(target, type, count) \
    do { \
        type *my_result = mymalloc(size_multiply(sizeof(type), count)); \
        target = my_result; \
    } while (0)

/// Allocate \a count elements of type \a type, initialise each element to \a init, and store the pointer to \a target.
/** Same failure behaviour as #MYMALLOC. \a count is evaluated once; \a init is evaluated once per element. */
#define MYMALLOCZ(target, type, count, init) \
    do { \
        size_t my_count = count; \
        type *my_result = mymalloc(size_multiply(sizeof(type), my_count)); \
        for (size_t my_i = 0; my_i < my_count; my_i++) { \
            my_result[my_i] = init; \
        } \
        target = my_result; \
    } while (0)

/// Allocate \a count1 times \a count2 elements of type \a type, initialise each element to \a init, and store the pointer to \a target.
/** The element count itself is computed with #size_multiply, so the product \a count1 * \a count2 is overflow-checked as well. */
#define MYMALLOC2Z(target, type, count1, count2, init) \
    MYMALLOCZ(target, type, size_multiply(count1, count2), init)

/// Multiply \a a by \a b; exit with failure if an overflow occurs.
/** Returns the product \a a * \a b. If the product does not fit in \c size_t, an error message is printed to \c stderr and the program exits with \c EXIT_FAILURE. A zero factor is valid and yields 0. */
static size_t
size_multiply(size_t a, size_t b)
{
    // Test before multiplying. The guard on b avoids a division by zero;
    // with b == 0 the product is 0 and cannot overflow.
    if (b != 0 && a > SIZE_MAX / b) {
        fprintf(stderr, "size overflow (%zu * %zu > %zu)\n", a, b, SIZE_MAX);
        exit(EXIT_FAILURE);
    }
    return a * b;
}

/// Obtain \a s bytes from \c malloc; never returns \c NULL.
/** If \c malloc returns \c NULL, a message with the requested size is printed to \c stderr and the program exits with \c EXIT_FAILURE. The caller owns the returned memory and releases it with \c free. */
static void *
mymalloc(size_t s)
{
    void *block = malloc(s);
    if (!block) {
        fprintf(stderr, "malloc failed (%zu bytes)\n", s);
        exit(EXIT_FAILURE);
    }
    return block;
}

/// @}
/// \addtogroup io I/O utilities
/// @{

/// A checked wrapper of \c vfscanf.
/** Reads from \a stream according to \a format and compares the number of converted fields with \a expected. On a mismatch, a diagnostic that contains \a description and \a format is printed to \c stderr and the program exits with \c EXIT_FAILURE. As a special case, \a expected can be \c EOF to require that the input is exhausted. */
static void
myfscanf(int expected, const char *description, FILE *stream, const char *format, ...)
{
    va_list args;
    va_start(args, format);
    const int got = vfscanf(stream, format, args);
    va_end(args);

    if (got == expected) {
        return;
    }

    // Pick the wording that matches the kind of mismatch.
    if (got == EOF) {
        fprintf(stderr, "%s: expected %d fields, got EOF: %s\n", description, expected, format);
    } else if (expected == EOF) {
        fprintf(stderr, "%s: expected EOF, got %d fields: %s\n", description, got, format);
    } else {
        fprintf(stderr, "%s: expected %d fields, got %d fields: %s\n", description, expected, got, format);
    }
    exit(EXIT_FAILURE);
}

/// Complain that \a s is not an unsigned integer and terminate.
/** Prints the offending string to \c stderr and exits with \c EXIT_FAILURE; never returns. */
static void
bad_uint(const char *s)
{
    FILE *const sink = stderr;
    fprintf(sink, "not a valid unsigned integer: %s\n", s);
    exit(EXIT_FAILURE);
}

/// Parse an unsigned integer in the string \a s; exit with failure if unsuccessful.
/** The whole string has to be a plain decimal number that fits in \c unsigned. Empty strings, leading white space, a leading sign, trailing garbage and out-of-range values are all reported through #bad_uint, which terminates the program. */
static unsigned
get_uint(const char *s)
{
    // strtoul() skips leading white space and accepts '+' and '-'; a negated
    // value wraps around silently and could pass the range checks below.
    // Requiring a leading digit rules out all of these (and the empty string).
    if (*s < '0' || *s > '9') {
        bad_uint(s);
    }
    char *pend;
    errno = 0;
    unsigned long v = strtoul(s, &pend, 10);
    // Trailing characters after the digits are not allowed.
    if (*pend != '\0') {
        bad_uint(s);
    }
    // Overflow of unsigned long.
    if (v == ULONG_MAX && errno == ERANGE) {
        bad_uint(s);
    }
#if UINT_MAX < ULONG_MAX
    // Fits in unsigned long but not in unsigned.
    if (v > UINT_MAX) {
        bad_uint(s);
    }
#endif
    return (unsigned int)v;
}

/// @}
/// \addtogroup builtin_rng Built-in random number generator
/// This part implements a simple pseudorandom number generator. The generator produces the same sequence of numbers on all platforms; it is intended for tests which require reproducible results.
/// @{

/// The number of random bits returned by #builtin_rng.
#define BUILTIN_RNG_BITS 15

/// The maximum value returned by #builtin_rng.
/** All #BUILTIN_RNG_BITS low-order bits set; also used as the output mask in #builtin_rng. */
#define BUILTIN_RNG_MAX ((1U << BUILTIN_RNG_BITS) - 1)

/// The number of random bits returned by #builtin_rng_large.
/** Twice #BUILTIN_RNG_BITS, because #builtin_rng_large concatenates two outputs of #builtin_rng. */
#define BUILTIN_RNG_LARGE_BITS (BUILTIN_RNG_BITS * 2)

/// The maximum value returned by #builtin_rng_large.
#define BUILTIN_RNG_LARGE_MAX ((1U << BUILTIN_RNG_LARGE_BITS) - 1)

/// The internal state of #builtin_rng.
/** Initialised by #builtin_rng_init and advanced by every call of #builtin_rng. */
typedef uint_fast32_t builtin_rng_state_t;

/// Reset the state used by #builtin_rng to its fixed starting value.
/** The seed is always 1, so the generated sequence is the same on every run. */
inline static void
builtin_rng_init(builtin_rng_state_t *state)
{
    const builtin_rng_state_t seed = 1;
    *state = seed;
}

/// Builtin pseudorandom number generator.
/** Advances \a state with one step of a linear congruential generator and returns a pseudorandom unsigned integer between \c 0 and #BUILTIN_RNG_MAX, taken from the state bits starting at bit 16. The algorithm is mentioned in the C standard (ISO/IEC 9899:1999), and it is also used in the GNU C Library to implement \c rand_r. */
inline static uint_fast16_t
builtin_rng(builtin_rng_state_t *state)
{
    const builtin_rng_state_t next = *state * 1103515245 + 12345;
    *state = next;
    return (next >> 16) & BUILTIN_RNG_MAX;
}

/// Builtin pseudorandom number generator with a larger range.
/** Calls #builtin_rng twice and concatenates the results: the first output becomes the high-order part, the second the low-order part. Returns a pseudorandom unsigned integer between \c 0 and #BUILTIN_RNG_LARGE_MAX. */
inline static uint_fast32_t
builtin_rng_large(builtin_rng_state_t *state)
{
    // The two declarations are separate statements, so the order of the
    // generator calls is fixed: high part first, then low part.
    const uint_fast32_t high = builtin_rng(state) << BUILTIN_RNG_BITS;
    const uint_fast32_t low = builtin_rng(state);
    return high | low;
}

/// Return a pseudorandom unsigned integer between \c 0 and \c n-1.
/** Scales one output of #builtin_rng_large to a fraction in [0, 1) and multiplies it by \a n; the product is truncated towards zero. */
inline static unsigned
builtin_rng_n(builtin_rng_state_t *state, unsigned n)
{
    const double unit = builtin_rng_large(state) / (BUILTIN_RNG_LARGE_MAX + 1.0);
    return (unsigned)((double)n * unit);
}

/// @}
/// \addtogroup perm Random permutations
/// @{

/// Return a random unsigned integer between \c 0 and \c n-1.
/** Uses \c rand from the C library: the result of \c rand is scaled to a fraction in [0, 1), multiplied by \a n and truncated towards zero. */
inline static unsigned
myrand_n(unsigned n)
{
    const double unit = rand() / (RAND_MAX + 1.0);
    return (unsigned)((double)n * unit);
}

/// Exchange the elements at positions \a i and \a j of \a table.
/** The positions may be equal, in which case \a table is left unchanged. */
inline static void
myswap(unsigned i, unsigned j, unsigned * restrict table)
{
    const unsigned at_i = table[i];
    table[i] = table[j];
    table[j] = at_i;
}

/// Fill \a table with the identity permutation of \a n elements.
/** After the call, \c table[k] equals \c k for every \c k below \a n. Nothing is written if \a n is 0. */
inline static void
identity_permutation(unsigned n, unsigned * restrict table)
{
    for (unsigned k = n; k-- > 0; ) {
        table[k] = k;
    }
}

/// Store a random permutation of \a n elements into \a table.
/** Starts from the identity permutation and, for each position in turn, swaps it with a position chosen by #myrand_n among the positions not yet fixed (including itself). Uses the random number generator from the C library. */
inline static void
rand_permutation(unsigned n, unsigned * restrict table)
{
    identity_permutation(n, table);
    unsigned remaining = n;
    for (unsigned pos = 0; pos < n; pos++, remaining--) {
        const unsigned pick = myrand_n(remaining) + pos;
        myswap(pos, pick, table);
    }
}

/// Store a pseudorandom permutation of \a n elements into \a table.
/** Same procedure as #rand_permutation, but the swap partners are drawn with #builtin_rng_n from \a state, so the outcome depends only on \a state and \a n. */
inline static void
builtin_rng_permutation(builtin_rng_state_t * restrict state, unsigned n, unsigned * restrict table)
{
    identity_permutation(n, table);
    unsigned remaining = n;
    for (unsigned pos = 0; pos < n; pos++, remaining--) {
        const unsigned pick = builtin_rng_n(state, remaining) + pos;
        myswap(pos, pick, table);
    }
}

/// @}
/// \addtogroup bitcount Bit counting
/// @{

/// Count the ones in the binary representation of \a w.
/** A naive bit-by-bit loop which does not use the array #bitcounts; it is used to fill that array. */
inline static unsigned
naive_bitcount(unsigned w)
{
    unsigned ones = 0;
    for (; w != 0; w >>= 1) {
        ones += (w & 1U);
    }
    return ones;
}

/// A lookup table where precomputed bit counts are stored.
/** Entry \c i holds the number of ones in the #BITCOUNT_BITS-bit value \c i. Filled by #init_bitcount. */
static bitcount_t bitcounts[1U << BITCOUNT_BITS];

/// Initialise the array #bitcounts.
/** Computes every entry with #naive_bitcount. */
static void
init_bitcount(void)
{
    const unsigned entries = 1U << BITCOUNT_BITS;
    for (unsigned fragment = 0; fragment < entries; fragment++) {
        bitcounts[fragment] = naive_bitcount(fragment);
    }
}

/// Compute the number of ones in the binary representation of \a w.
/** The word is cut into fragments of #BITCOUNT_BITS bits; each fragment is looked up in #bitcounts and the results are added. The set of fragments is chosen at compile time from #BITCOUNT_BITS and #WORD_BITS. Only the #WORD_BITS low-order bits of \a w are examined. */
inline static unsigned
bitcount(word_t w)
{
    return
#if BITCOUNT_BITS == 8
        // Four byte-sized fragments cover the low 32 bits.
        bitcounts[w & 0xFFU]
        + bitcounts[(w >> 8) & 0xFFU]
        + bitcounts[(w >> 16) & 0xFFU]
        + bitcounts[(w >> 24) & 0xFFU]
#  if WORD_BITS == 64
        // Four more fragments for the high 32 bits.
        + bitcounts[(w >> 32) & 0xFFU]
        + bitcounts[(w >> 40) & 0xFFU]
        + bitcounts[(w >> 48) & 0xFFU]
        + bitcounts[(w >> 56) & 0xFFU]
#  endif
#elif BITCOUNT_BITS == 11
        // Three 11-bit fragments start at bits 0, 11 and 22.
        bitcounts[w & 0x7FFU]
        + bitcounts[(w >> 11) & 0x7FFU]
        + bitcounts[(w >> 22) & 0x7FFU]
#  if WORD_BITS == 64
        // Three more fragments start at bits 33, 44 and 55.
        + bitcounts[(w >> 33) & 0x7FFU]
        + bitcounts[(w >> 44) & 0x7FFU]
        + bitcounts[(w >> 55) & 0x7FFU]
#  endif
#elif BITCOUNT_BITS == 16
        // Two 16-bit fragments cover the low 32 bits.
        bitcounts[w & 0xFFFFU]
        + bitcounts[(w >> 16) & 0xFFFFU]
#  if WORD_BITS == 64
        // Two more fragments for the high 32 bits.
        + bitcounts[(w >> 32) & 0xFFFFU]
        + bitcounts[(w >> 48) & 0xFFFFU]
#  endif
#else
#  error "Unsupported BITCOUNT_BITS"
#endif
        ;
}

/// Bit-parallel arithmetic on numbers 0, 1, more.
/** Each bit position holds one saturating counter with the values "zero", "one" and "more than one", encoded in two bit planes. Counters are combined with #zom_add; the positions whose value is exactly one are extracted with #zom_exactly_1. */
typedef struct {
    /// A bit is set if the value is at least 1.
    word_t at_least_1;
    /// A bit is set if the value is at least 2.
    word_t at_least_2;
} zom_t;

/// Calculate the saturating sum \a x + \a y in zom_t.
/** A position is at least 1 if it is at least 1 in either operand. It is at least 2 if it is at least 2 in either operand, or at least 1 in both. */
inline static zom_t zom_add(zom_t x, zom_t y)
{
    const word_t seen_in_both = x.at_least_1 & y.at_least_1;
    const zom_t sum = {
        .at_least_1 = x.at_least_1 | y.at_least_1,
        .at_least_2 = x.at_least_2 | y.at_least_2 | seen_in_both
    };
    return sum;
}

/// Return a bit vector where a bit is set if the value is exactly 1.
/** These are the positions that are at least 1 but not at least 2. */
inline static word_t zom_exactly_1(zom_t x)
{
    const word_t below_2 = ~x.at_least_2;
    return x.at_least_1 & below_2;
}

/// Zero an array of #word_t.
/** Sets the first \a n elements of \a array to 0. */
inline static void
word_clear(word_t * restrict array, size_t n)
{
    // The index has the same type as the count; an unsigned index could
    // never reach a count above UINT_MAX and the loop would not terminate.
    for (size_t i = 0; i < n; i++) {
        array[i] = 0;
    }
}

/// Zero an array of #zom_t.
/** Clears both bit planes of the first \a n elements of \a array. */
inline static void
zom_clear(zom_t * restrict array, size_t n)
{
    // The index has the same type as the count; an unsigned index could
    // never reach a count above UINT_MAX and the loop would not terminate.
    for (size_t i = 0; i < n; i++) {
        array[i].at_least_1 = 0;
        array[i].at_least_2 = 0;
    }
}

/// @}
/// \addtogroup input Input
/// User interface for reading the input.
/// @{

/// Print documentation and exit.
/** Writes the help text to standard output and exits with \c EXIT_SUCCESS. \a name is the program name shown in the usage lines. The reported quantiles are taken from #LEVELS_LIST at compile time. */
static void
usage(const char *name)
{
    printf(
        "Type and hapax accumulation curves\n"
        "\n"
        "usage:\n"
        "  %s [OPTION]... ITERATIONS SLOTS\n"
        "  %s [OPTION]... ITERATIONS --slot-size SLOT_SIZE\n"
        "\n"
        "  --help  This help.\n"
        "  --version  Version and copyright information.\n"
        "  --slot-size  Specify the size of a slot instead of the number of the slots.\n"
        "  --hapax  Count hapaxes only.\n"
        "  --items-from-table  Item counts from the incidence matrix.\n"
        "  --brief  Skip redundant identical rows in output.\n"
        "  --raw-data  Output data for each permutation.\n"
        "  --nonrandom  For debugging: use the identity permutation.\n"
        "  --builtin-rng  For debugging: use the built-in pseudorandom number generator.\n"
        "\n"
        "  ITERATIONS  The number of iterations.\n"
        "  SLOTS  The number of slots.\n"
        "  SLOT_SIZE  The size of each slot.\n"
        "\n"
        "Input format (fields separated by any whitespace):\n"
        "\n"
        "  <number-of-samples> <number-of-types>\n"
        "  <sample-1-label> <sample-1-total-items> ...\n"
        "  <sample-m-label> <sample-m-total-items>\n"
        "  <type-1-label> ...\n"
        "  <type-n-label>\n"
        "  <count-sample-1-type-1> ... <count-sample-1-type-n> ...\n"
        "  <count-sample-m-type-1> ... <count-sample-m-type-n> ...\n"
        "\n"
        "Output format (one line for each slot):\n"
        "\n"
        "  <item-count> <lower-bounds> ... <upper-bounds> ...\n"
        "  ...\n"
        "\n"
        "Lower and upper bounds are printed for the following quantiles:\n"
        "\n"
        "  " STRINGIFY((LEVELS_LIST)) "\n"
        "\n"
        "Output format for --raw-data:\n"
        "\n"
        "  <item-count> <value>\n"
        "  ...\n"
        "\n",
        name, name);

    exit(EXIT_SUCCESS);
}

/// Print version information and exit.
static void
version(void)
{
    printf(
        "Type and hapax accumulation curves, version " VERSION "\n"
        "Copyright (C) " YEAR "  Jukka Suomela\n"
        "\n"
        "This program comes with ABSOLUTELY NO WARRANTY.\n"
        "This is free software, and you are welcome to redistribute it\n"
        "under the terms of the GNU General Public License\n"
        "<http://www.gnu.org/licenses/gpl.html>.\n\n"
    );
    exit(EXIT_SUCCESS);
}

/// Input.
/** Everything the computation needs to know: the command line settings (filled in by #parse_command_line) and the data read from standard input (filled in by #process_input). The dynamically allocated members are released by #free_input. */
typedef struct {
    /// Binary incidence matrix for type counting.
    /** Each row of #type_words consecutive elements corresponds to a sample. Allocated by #process_input only if #hapax is false. */
    word_t * restrict incidenceb;
    /// Zero-one-more incidence matrix for hapax counting.
    /** Organised in the same way as #incidenceb. Allocated by #process_input only if #hapax is true. */
    zom_t * restrict incidencezom;
    /// The number of iterations.
    /** Always defined by #parse_command_line. Positive. */
    unsigned iterations;
    /// The number of slots.
    /** Not used if #raw_data. If #slot_size is true, this is defined by #prepare_slots based on #requested_slot_size; otherwise it is defined already in #parse_command_line. At least 2. */
    unsigned slots;
    /// The size of each slot.
    /** Positive. Defined if #slot_size is true. */
    unsigned requested_slot_size;
    /// The number of samples.
    /** Positive. */
    unsigned nsample;
    /// The number of types.
    /** Positive. */
    unsigned ntype;
    /// Memory words per sample.
    /** Positive. The number of #WORD_BITS-bit words needed to hold #ntype bits. */
    unsigned type_words;
    /// The total number of items.
    /** Positive. The sum of all elements of #sample_items. */
    unsigned total_items;
    /// Items per sample.
    /** An array of #nsample elements. Positive when read from the per-sample input field; if #items_from_table is set, each element is instead the sum of the counts on the sample's row of the incidence matrix. */
    unsigned *sample_items;
    /// Specify the size of a slot instead of the number of the slots.
    bool slot_size;
    /// Get the item counts from the incidence matrix instead of using a separate vector.
    bool items_from_table;
    /// Count the number of hapaxes only.
    /** By default, the program computes the number of all types. Hapaxes are types which have occurred exactly once so far. */
    bool hapax;
    /// Skip redundant identical rows in output.
    /** If there are 3 or more identical rows, print only the first one and the last one. */
    bool brief;
    /// Output data for each permutation.
    /** By default, the program accumulates information on each permutation for each slot; only a summary of the results is printed. If \c raw_data is set, then the raw type (or hapax) accumulation curve is printed for each permutation. */
    bool raw_data;
    /// For debugging: do not randomise but use the identity permutation.
    bool nonrandom;
    /// For debugging: use the built-in pseudorandom number generator.
    bool builtin_rng;
} input_t;

/// Parse the command line.
/** Set the relevant parts of \a pinput. Exit with failure if unsuccessful. Print usage information and exit successfully if the command line switch \c -h or \c --help is specified; print version information and exit successfully for \c -v, \c -V or \c --version.

The first positional argument is the number of iterations. The second one is the number of slots, or the slot size if \c --slot-size has already been seen; no second positional argument is accepted once \c --raw-data has been seen. */
static void
parse_command_line(input_t * restrict pinput, int argc, char **argv)
{
    // Zero doubles as the "not given yet" marker for the positional arguments.
    pinput->iterations = 0;
    pinput->slots = 0;
    pinput->requested_slot_size = 0;
    pinput->hapax = false;
    pinput->slot_size = false;
    pinput->items_from_table = false;
    pinput->brief = false;
    pinput->raw_data = false;
    pinput->nonrandom = false;
    pinput->builtin_rng = false;

    for (int i = 1; i < argc; i++) {
        if (strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-h") == 0) {
            usage(argv[0]);
        } else if (strcmp(argv[i], "--version") == 0 || strcmp(argv[i], "-v") == 0 || strcmp(argv[i], "-V") == 0) {
            version();
        } else if (strcmp(argv[i], "--hapax") == 0) {
            pinput->hapax = true;
        } else if (strcmp(argv[i], "--slot-size") == 0) {
            pinput->slot_size = true;
        } else if (strcmp(argv[i], "--items-from-table") == 0) {
            pinput->items_from_table = true;
        } else if (strcmp(argv[i], "--raw-data") == 0) {
            pinput->raw_data = true;
        } else if (strcmp(argv[i], "--brief") == 0) {
            pinput->brief = true;
        } else if (strcmp(argv[i], "--nonrandom") == 0) {
            pinput->nonrandom = true;
        } else if (strcmp(argv[i], "--builtin-rng") == 0) {
            pinput->builtin_rng = true;
        } else if (pinput->iterations == 0) {
            // First positional argument.
            pinput->iterations = get_uint(argv[i]);
            if (pinput->iterations == 0) {
                fprintf(stderr, "invalid arguments: the number of iterations cannot be 0\n");
                exit(EXIT_FAILURE);
            }
        } else if (!pinput->raw_data && !pinput->slot_size && pinput->slots == 0) {
            // Second positional argument, interpreted as the number of slots.
            pinput->slots = get_uint(argv[i]);
            if (pinput->slots < 2) {
                fprintf(stderr, "invalid arguments: the number of slots has to be at least 2\n");
                exit(EXIT_FAILURE);
            }
        } else if (!pinput->raw_data && pinput->slot_size && pinput->requested_slot_size == 0) {
            // Second positional argument, interpreted as the slot size.
            pinput->requested_slot_size = get_uint(argv[i]);
            if (pinput->requested_slot_size == 0) {
                fprintf(stderr, "invalid arguments: the slot size cannot be 0\n");
                exit(EXIT_FAILURE);
            }
        } else {
            fprintf(stderr, "too many command line arguments: '%s'\n", argv[i]);
            exit(EXIT_FAILURE);
        }
    }

    // Check that all mandatory arguments were present.
    if (pinput->iterations == 0) {
        fprintf(stderr, "invalid arguments: the number of iterations not given\n");
        exit(EXIT_FAILURE);
    }
    if (!pinput->raw_data) {
        if (pinput->slot_size) {
            if (pinput->requested_slot_size == 0) {
                fprintf(stderr, "invalid arguments: the slot size not given\n");
                exit(EXIT_FAILURE);
            }
        } else {
            if (pinput->slots == 0) {
                // This message used to blame the number of iterations.
                fprintf(stderr, "invalid arguments: the number of slots not given\n");
                exit(EXIT_FAILURE);
            }
        }
    }
}

/// Read the input file.
/** Reads the whole input from standard input in the format described by #usage. Allocates memory for the per-sample item counts and for one incidence matrix (input_t::incidencezom if input_t::hapax is set, input_t::incidenceb otherwise) and sets the relevant parts of \a pinput. Exits with failure if the input is malformed, if a mandatory count is zero, or if the total number of items does not fit in \c unsigned. */
static void
process_input(input_t * restrict pinput)
{
    myfscanf(2, "<number-of-samples> <number-of-types>", stdin, "%u %u", &pinput->nsample, &pinput->ntype);
    if (pinput->nsample == 0) {
        fprintf(stderr, "invalid input: the number of samples has to be at least 1\n");
        exit(EXIT_FAILURE);
    }
    if (pinput->ntype == 0) {
        fprintf(stderr, "invalid input: the number of types has to be at least 1\n");
        exit(EXIT_FAILURE);
    }

    // Round up without forming ntype + WORD_BITS - 1, which could wrap around.
    pinput->type_words = pinput->ntype / WORD_BITS + (pinput->ntype % WORD_BITS != 0 ? 1U : 0U);
    pinput->total_items = 0;
    MYMALLOC(pinput->sample_items, unsigned, pinput->nsample);
    for (unsigned i = 0; i < pinput->nsample; i++) {
        unsigned v;
        // The label is skipped; only the item count is stored.
        myfscanf(1, "<sample-label> <sample-total-items>", stdin, "%*s %u", &v);
        if (pinput->items_from_table) {
            // The count from the header is ignored; it is accumulated from the matrix below.
            pinput->sample_items[i] = 0;
        } else {
            if (v == 0) {
                fprintf(stderr, "invalid input in sample %u: the number of items has to be at least 1\n", i);
                exit(EXIT_FAILURE);
            }
            // A wrapped total would silently corrupt the slot thresholds.
            if (v > UINT_MAX - pinput->total_items) {
                fprintf(stderr, "invalid input in sample %u: the total number of items is too large\n", i);
                exit(EXIT_FAILURE);
            }
            pinput->sample_items[i] = v;
            pinput->total_items += pinput->sample_items[i];
        }
    }
    // The type labels are read and discarded.
    for (unsigned i = 0; i < pinput->ntype; i++) {
        myfscanf(0, "<type-label>", stdin, "%*s");
    }
    if (pinput->hapax) {
        const zom_t zero = { 0, 0 };
        MYMALLOC2Z(pinput->incidencezom, zom_t, pinput->nsample, pinput->type_words, zero);
    } else {
        MYMALLOC2Z(pinput->incidenceb, word_t, pinput->nsample, pinput->type_words, 0);
    }
    for (unsigned i = 0; i < pinput->nsample; i++) {
        for (unsigned j = 0; j < pinput->ntype; j++) {
            unsigned v;
            myfscanf(1, "<count-sample-type>", stdin, "%u", &v);
            if (v > 0) {
                // Row i, bit j of the incidence matrix.
                size_t pos = (size_t)i * pinput->type_words + get_msb_index(j);
                word_t mask = get_lsb_bit(j);
                if (pinput->hapax) {
                    pinput->incidencezom[pos].at_least_1 |= mask;
                    if (v > 1) {
                        pinput->incidencezom[pos].at_least_2 |= mask;
                    }
                } else {
                    pinput->incidenceb[pos] |= mask;
                }
            }
            if (pinput->items_from_table) {
                // sample_items[i] never exceeds total_items, so one check covers both sums.
                if (v > UINT_MAX - pinput->total_items) {
                    fprintf(stderr, "invalid input in sample %u: the total number of items is too large\n", i);
                    exit(EXIT_FAILURE);
                }
                pinput->sample_items[i] += v;
                pinput->total_items += v;
            }
        }
    }
    // Nothing but white space may follow the matrix.
    myfscanf(EOF, "end of input", stdin, "%*s");

    if (pinput->total_items == 0) {
        fprintf(stderr, "invalid input: the total number of items has to be at least 1\n");
        exit(EXIT_FAILURE);
    }
}

/// Release the memory owned by the fields of #input_t.
/** Frees input_t::sample_items and whichever incidence matrix is in use: input_t::incidencezom if input_t::hapax is set, input_t::incidenceb otherwise. */
static void
free_input(const input_t *pinput)
{
    free(pinput->sample_items);

    void *matrix = pinput->hapax
        ? (void *)pinput->incidencezom
        : (void *)pinput->incidenceb;
    free(matrix);
}

/// @}
/// \addtogroup compute Computation
/// @{

/// Output.
/** The slot thresholds and the histograms of per-slot bounds that are accumulated over all permutations. */
typedef struct {
    /// Slot threshold.
    /** An array of input_t::slots item counts, prepared by #prepare_slots. The first element is 0 and the last one is input_t::total_items. */
    unsigned * restrict slot_threshold;
    /// Lower bounds.
    /** Let f(i) = "a lower bound for the value (types or hapaxes) in the closed range from slot_threshold[i] items to slot_threshold[i + 1] items". Then lower_bound[j * input.slots + i] = number of permutations such that f(i) = j. See #record. */
    unsigned * restrict lower_bound;
    /// Upper bounds.
    /** Let g(i) = "an upper bound for the value (types or hapaxes) in the closed range from slot_threshold[i] items to slot_threshold[i + 1] items". Then upper_bound[j * input.slots + i] = number of permutations such that g(i) = j. See #record. */
    unsigned * restrict upper_bound;
} output_t;

/// Bounds for values.
/** A pair of a lower and an upper bound for the number of types or hapaxes, used both for a single sample and for a gap between two slot thresholds. */
typedef struct {
    /// Lower bound.
    unsigned lower;
    /// Upper bound.
    unsigned upper;
} bounds_t;

/// Prepare output_t::slot_threshold.
/** Also computes input_t::slots if needed: with input_t::slot_size set, the number of slots is the number of slots of size input_t::requested_slot_size needed to cover input_t::total_items (rounded up), plus one. Allocates the threshold array; the first threshold is always 0 and the last one is always input_t::total_items. Exits with failure if the number of slots does not fit in \c unsigned. */
static void
prepare_slots(input_t * restrict pinput, output_t * restrict poutput)
{
    if (pinput->slot_size) {
        assert(pinput->slots == 0);
        // Round up without forming total_items + requested_slot_size - 1:
        // that sum wraps around for large slot sizes and would yield fewer
        // than the two slots that the rest of the program relies on.
        const unsigned size = pinput->requested_slot_size;
        const unsigned full = pinput->total_items / size;
        const unsigned partial = (pinput->total_items % size != 0) ? 1U : 0U;
        if (full > UINT_MAX - 1U - partial) {
            fprintf(stderr, "invalid arguments: the slot size is too small for this input\n");
            exit(EXIT_FAILURE);
        }
        pinput->slots = full + partial + 1U;
    }
    MYMALLOC(poutput->slot_threshold, unsigned, pinput->slots);
    if (pinput->slot_size) {
        // Evenly spaced multiples of the slot size; the last element is set below.
        for (unsigned i = 0; i < pinput->slots - 1; i++) {
            poutput->slot_threshold[i] = i * pinput->requested_slot_size;
        }
    } else {
        // Spread the thresholds evenly from 0 to total_items.
        for (unsigned i = 0; i < pinput->slots; i++) {
            poutput->slot_threshold[i] = (unsigned)lround((double)pinput->total_items * (double)i / (double)(pinput->slots - 1));
        }
    }
    // Pin the end points regardless of rounding.
    poutput->slot_threshold[0] = 0;
    poutput->slot_threshold[pinput->slots - 1] = pinput->total_items;
}

/// Construct the next permutation of the samples in \a sample_order.
/** The method follows the user's choice: the identity permutation if input_t::nonrandom is set, otherwise #builtin_rng_permutation driven by \a rng_state if input_t::builtin_rng is set, otherwise #rand_permutation. */
inline static void
next_permutation(builtin_rng_state_t * restrict rng_state, const input_t *pinput, unsigned * restrict sample_order)
{
    const unsigned count = pinput->nsample;

    if (pinput->nonrandom) {
        identity_permutation(count, sample_order);
        return;
    }
    if (pinput->builtin_rng) {
        builtin_rng_permutation(rng_state, count, sample_order);
        return;
    }
    rand_permutation(count, sample_order);
}

/// Process the sample number \a sample when counting types.
/** Merges the row of the sample into the type accumulation vector \a accum and recounts the accumulated types. The bounds for this sample are stored in \a pb_sample: the lower bound is the count before the sample (the old value of \a p_accum_types), the upper bound is the count after it. Finally \a p_accum_types is updated to the new count. */
inline static void
calculate_bounds_normal(const input_t *pinput, unsigned sample, word_t * restrict accum, unsigned * restrict p_accum_types, bounds_t * restrict pb_sample)
{
    const unsigned words = pinput->type_words;
    const word_t * restrict row = pinput->incidenceb + (size_t)sample * words;

    unsigned count_after = 0;
    for (unsigned w = 0; w < words; w++) {
        const word_t merged = accum[w] | row[w];
        accum[w] = merged;
        count_after += bitcount(merged);
    }

    const unsigned count_before = *p_accum_types;
    // Merging can only add types.
    assert(count_before <= count_after);

    pb_sample->lower = count_before;
    pb_sample->upper = count_after;
    *p_accum_types = count_after;
}

/// Process the sample number \a sample. Update hapax accumulation vector \a accum and hapax accumulation count in \a p_accum_hapaxes. Calculate hapax accumulation bounds for this sample and store the result in \a pb_sample.
/** The lower bound is the old count minus the hapaxes that this sample removes; the upper bound is the old count plus the hapaxes that this sample creates, plus the "temporary" hapaxes (types that were unseen before and occur at least twice in this sample). The new count is the old count plus the created and minus the removed hapaxes. */
inline static void
calculate_bounds_hapax(const input_t *pinput, unsigned sample, zom_t * restrict accum, unsigned * restrict p_accum_hapaxes, bounds_t * restrict pb_sample)
{
    // The row of this sample in the zero-one-more incidence matrix.
    const zom_t * restrict vector = pinput->incidencezom + (size_t)sample * pinput->type_words;

    unsigned removed = 0;
    unsigned created = 0;
    unsigned temporary = 0;
    for (unsigned j = 0; j < pinput->type_words; j++) {
        // Add the sample to the accumulated counters, one word at a time.
        const zom_t current = vector[j];
        const zom_t accum_old = accum[j];
        const zom_t accum_new = zom_add(accum_old, current);
        accum[j] = accum_new;

        // Hapaxes before and after this sample.
        const word_t hapax_old = zom_exactly_1(accum_old);
        const word_t hapax_new = zom_exactly_1(accum_new);
        // Previously unseen types that occur at least twice in this sample.
        const word_t hapax_temporary = ~accum_old.at_least_1 & current.at_least_2;
        assert((hapax_old & hapax_temporary) == 0);
        assert((hapax_new & hapax_temporary) == 0);
        temporary += bitcount(hapax_temporary);
        removed += bitcount(hapax_old & ~hapax_new);
        created += bitcount(~hapax_old & hapax_new);
    }

    assert(*p_accum_hapaxes >= removed);
    assert(*p_accum_hapaxes + created + temporary <= pinput->ntype);

    pb_sample->lower = *p_accum_hapaxes - removed;
    pb_sample->upper = *p_accum_hapaxes + created + temporary;
    *p_accum_hapaxes = *p_accum_hapaxes + created - removed;
}

/// Merge the bounds for this sample in \a pb_sample into the bounds for this gap in \a pb_this_gap.
/** This function assumes that we are counting types: the count never decreases, so only the upper bound of the gap has to be raised. The expected ordering of the bounds is checked with assertions. Cf. #update_bounds_hapax. */
inline static void
update_bounds_normal(bounds_t * restrict pb_this_gap, const bounds_t *pb_sample)
{
    const unsigned raised_upper = pb_sample->upper;

    // The number of accumulated types is nondecreasing.
    assert(pb_this_gap->lower <= pb_sample->lower);
    assert(pb_sample->lower <= pb_this_gap->upper);
    assert(pb_this_gap->upper <= raised_upper);

    pb_this_gap->upper = raised_upper;
}

/// Merge the bounds for this sample in \a pb_sample into the bounds for this gap in \a pb_this_gap.
/** This function assumes that we are counting hapaxes: the count can move in both directions, so the gap keeps the smallest lower bound and the largest upper bound seen. Cf. #update_bounds_normal. */
inline static void
update_bounds_hapax(bounds_t * restrict pb_this_gap, const bounds_t *pb_sample)
{
    if (pb_sample->lower < pb_this_gap->lower) {
        pb_this_gap->lower = pb_sample->lower;
    }
    if (pb_sample->upper > pb_this_gap->upper) {
        pb_this_gap->upper = pb_sample->upper;
    }
}

/// Store new per-slot bounds.
/** Increments one cell in each of the two histograms of \a poutput: the cell for the value \c b_slot.lower in output_t::lower_bound and the cell for the value \c b_slot.upper in output_t::upper_bound, both in the column \a current_slot. Rows are \a slots cells wide. */
inline static void
record(unsigned slots, const output_t *poutput, bounds_t b_slot, unsigned current_slot)
{
    unsigned *lower_cell = poutput->lower_bound + ((size_t)b_slot.lower * slots + current_slot);
    unsigned *upper_cell = poutput->upper_bound + ((size_t)b_slot.upper * slots + current_slot);
    ++*lower_cell;
    ++*upper_cell;
}

/// Store new per-slot bounds based on per-gap bounds.
/** For example, if output_t#slot_threshold is 0, 10, 20, etc., and \a current_slot is 2, then \a b_prev_gap is the bounds for the gap 10..20, \a b_this_gap is the bounds for the gap 20..30 and we update the bounds for the slot 20. This function assumes that we are counting types, so the slot takes the lower bound of the previous gap and the upper bound of this gap. Cf. #record_hapax. */
inline static void
record_normal(unsigned slots, const output_t *poutput, bounds_t b_prev_gap, bounds_t b_this_gap, unsigned current_slot)
{
    // The number of accumulated types is nondecreasing.
    assert(b_prev_gap.lower <= b_this_gap.lower);
    assert(b_prev_gap.upper <= b_this_gap.upper);

    const bounds_t b_slot = {
        .lower = b_prev_gap.lower,
        .upper = b_this_gap.upper
    };
    record(slots, poutput, b_slot, current_slot);
}

/// Store new per-slot bounds based on per-gap bounds.
/** This function assumes that we are counting hapaxes: the slot takes the smaller of the two lower bounds and the larger of the two upper bounds of the adjacent gaps. Cf. #record_normal. */
inline static void
record_hapax(unsigned slots, const output_t *poutput, bounds_t b_prev_gap, bounds_t b_this_gap, unsigned current_slot)
{
    // Here we need to compute the minimums and maximums.
    bounds_t b_slot = b_this_gap;
    if (b_prev_gap.lower < b_slot.lower) {
        b_slot.lower = b_prev_gap.lower;
    }
    if (b_prev_gap.upper > b_slot.upper) {
        b_slot.upper = b_prev_gap.upper;
    }
    record(slots, poutput, b_slot, current_slot);
}

/// A macro for constructing #calculate_statistics_normal and #calculate_statistics_hapax.
/**
    Expands to the definition of a static function <tt>calculate_statistics_SUFFIX(pinput, poutput)</tt>. \a suffix selects the helpers <tt>calculate_bounds_SUFFIX</tt>, <tt>update_bounds_SUFFIX</tt> and <tt>record_SUFFIX</tt>; \a representation selects the accumulator element type <tt>REPRESENTATION_t</tt> and its clearing function <tt>REPRESENTATION_clear</tt>.

    The generated function initialises one RNG state with builtin_rng_init and then repeats the following input_t#iterations times. It fills a sample order with next_permutation and clears the accumulator; both are variable-length arrays, sized by input_t#nsample and input_t#type_words. It then walks through the samples in that order. For each sample it adds input_t#sample_items to the running item count, obtains the per-sample bounds from <tt>calculate_bounds_SUFFIX</tt> (which is also given a pointer to the running value), and merges them into the bounds of the current gap with <tt>update_bounds_SUFFIX</tt>.

    While the running item count has reached output_t#slot_threshold of the next slot, the current slot is stored with <tt>record_SUFFIX</tt>, the current gap becomes the previous gap, and the new gap starts from the bounds of the sample that crossed the threshold. If the running item count is exactly equal to the threshold of the current slot, both bounds of the current gap are set to the running value. After the last sample, the function asserts that the current slot is the last one and stores it.

    Note for maintainers: inside the macro body only block comments can be used, because a line comment that ends in a line-continuation backslash would absorb the next line of the macro.
*/
#define DEF_CALCULATE_STATISTICS(suffix, representation) \
static void \
calculate_statistics_ ## suffix(const input_t * restrict pinput, const output_t *poutput) \
{ \
    builtin_rng_state_t rng_state; \
    builtin_rng_init(&rng_state); \
    \
    for (unsigned iteration = 0; iteration < pinput->iterations; iteration++) { \
        unsigned sample_order[pinput->nsample]; \
        next_permutation(&rng_state, pinput, sample_order); \
        \
        representation ## _t accum[pinput->type_words]; \
        representation ## _clear(accum, pinput->type_words); \
        \
        unsigned accum_items = 0; \
        unsigned accum_value = 0; \
        unsigned current_slot = 0; \
        \
        bounds_t b_prev_gap = { 0, 0 }; \
        bounds_t b_this_gap = { 0, 0 }; \
        \
        for (unsigned i = 0; i < pinput->nsample; i++) { \
            const unsigned sample = sample_order[i]; \
            accum_items += pinput->sample_items[sample]; \
            \
            bounds_t b_sample; \
            calculate_bounds_ ## suffix(pinput, sample, accum, &accum_value, &b_sample); \
            update_bounds_ ## suffix(&b_this_gap, &b_sample); \
            \
            while (current_slot + 1 < pinput->slots && accum_items >= poutput->slot_threshold[current_slot + 1]) { \
                record_ ## suffix(pinput->slots, poutput, b_prev_gap, b_this_gap, current_slot); \
                current_slot++; \
                b_prev_gap = b_this_gap; \
                b_this_gap = b_sample; \
            } \
            if (accum_items == poutput->slot_threshold[current_slot]) { \
                b_this_gap.lower = accum_value; \
                b_this_gap.upper = accum_value; \
            } \
        } \
        \
        assert(current_slot == pinput->slots - 1); \
        record_ ## suffix(pinput->slots, poutput, b_prev_gap, b_this_gap, current_slot); \
    } \
}

/// Calculate the bounds for type accumulation.
/** Defines #calculate_statistics_normal, which uses the \c word accumulator representation. */
DEF_CALCULATE_STATISTICS(normal, word)

/// Calculate the bounds for hapax accumulation.
/** Defines #calculate_statistics_hapax, which uses the \c zom accumulator representation. */
DEF_CALCULATE_STATISTICS(hapax, zom)

/// Allocate the counter tables and fill them for type or hapax accumulation.
/** output_t#lower_bound and output_t#upper_bound are both set up with MYMALLOC2Z, using the dimensions (input_t#ntype + 1) and input_t#slots. The counting routine is then chosen by input_t#hapax: #calculate_statistics_hapax if it is set, #calculate_statistics_normal otherwise. */
static void
calculate_statistics(const input_t *pinput, output_t * restrict poutput)
{
    MYMALLOC2Z(poutput->lower_bound, unsigned, pinput->ntype+1, pinput->slots, 0);
    MYMALLOC2Z(poutput->upper_bound, unsigned, pinput->ntype+1, pinput->slots, 0);

    if (!pinput->hapax) {
        calculate_statistics_normal(pinput, poutput);
        return;
    }
    calculate_statistics_hapax(pinput, poutput);
}

/// Releases the arrays owned by an #output_t.
/** Frees output_t#slot_threshold, output_t#lower_bound and output_t#upper_bound, in this order. The #output_t object itself is not freed. */
static void
free_output(const output_t *poutput)
{
    void * const owned[] = {
        poutput->slot_threshold,
        poutput->lower_bound,
        poutput->upper_bound
    };

    for (size_t k = 0; k < sizeof owned / sizeof owned[0]; k++) {
        free(owned[k]);
    }
}

/// @}
/// \addtogroup raw Raw data output
/// @{

/// Print one row of raw data to the standard output.
/** The row consists of \a accum_items (field width 8) and \a accum_value (field width 4), separated by a space and followed by a newline. */
static void
print_raw_data(unsigned accum_items, unsigned accum_value)
{
    fprintf(stdout, "%8u %4u\n", accum_items, accum_value);
}

/// A macro for constructing #calculate_raw_data_normal and #calculate_raw_data_hapax.
/** Expands to the definition of a static function <tt>calculate_raw_data_SUFFIX(pinput)</tt>. For each of input_t#iterations sample orders, the generated function prints one row with zero items and zero value, followed by one row per sample that gives the running item count and the running value. \a suffix selects the routine <tt>calculate_bounds_SUFFIX</tt>; \a representation selects the accumulator element type <tt>REPRESENTATION_t</tt> and its clearing function <tt>REPRESENTATION_clear</tt>. */
#define DEF_CALCULATE_RAW_DATA(suffix, representation) \
static void \
calculate_raw_data_ ## suffix(const input_t *pinput) \
{ \
    builtin_rng_state_t rng; \
    builtin_rng_init(&rng); \
    \
    for (unsigned iter = 0; iter < pinput->iterations; iter++) { \
        unsigned order[pinput->nsample]; \
        next_permutation(&rng, pinput, order); \
        \
        representation ## _t seen[pinput->type_words]; \
        representation ## _clear(seen, pinput->type_words); \
        \
        unsigned items_so_far = 0; \
        unsigned value_so_far = 0; \
        \
        /* Every curve starts with a row for the empty prefix. */ \
        print_raw_data(items_so_far, value_so_far); \
        for (unsigned pos = 0; pos < pinput->nsample; pos++) { \
            const unsigned pick = order[pos]; \
            /* The per-sample bounds are required by the callee but not used here. */ \
            bounds_t b_ignored; \
            \
            items_so_far += pinput->sample_items[pick]; \
            calculate_bounds_ ## suffix(pinput, pick, seen, &value_so_far, &b_ignored); \
            print_raw_data(items_so_far, value_so_far); \
        } \
    } \
}

/// Print raw data for type accumulation.
DEF_CALCULATE_RAW_DATA(normal, word)

/// Print raw data for hapax accumulation.
DEF_CALCULATE_RAW_DATA(hapax, zom)

/// Print raw data for type or hapax accumulation.
static void
calculate_raw_data(const input_t *pinput)
{
    if (pinput->hapax) {
        calculate_raw_data_hapax(pinput);
    } else {
        calculate_raw_data_normal(pinput);
    }
}

/// @}
/// \addtogroup output Output
/// @{

/// The significance levels to report, as listed in LEVELS_LIST.
const double LEVELS[] = { LEVELS_LIST };

/// The number of elements in #LEVELS (a value of type \c size_t).
#define NLEVELS (sizeof(LEVELS) / sizeof(*LEVELS))

/// Prints one row of the result table to the standard output.
/** The row starts with \a slot_threshold, continues with the #NLEVELS entries of \a lower_bounds in index order, then the #NLEVELS entries of \a upper_bounds in reverse index order, and ends with a newline. Both arrays must have at least #NLEVELS elements. */
static void
print_row(unsigned slot_threshold, const unsigned *lower_bounds, const unsigned *upper_bounds)
{
    printf("%8u", slot_threshold);

    // Lower bounds: first level first.
    for (size_t k = 0; k < NLEVELS; k++) {
        printf(" %4u", lower_bounds[k]);
    }

    // Upper bounds: last level first, mirroring the lower bounds.
    for (size_t left = NLEVELS; left > 0; left--) {
        printf(" %4u", upper_bounds[left - 1]);
    }

    printf("\n");
}

/// Prints the results.
/** For every slot, the counters in output_t#lower_bound and output_t#upper_bound are converted into one lower and one upper bound per entry of #LEVELS, and the row is printed with #print_row. The lower bound for a level is the smallest value at which the cumulative fraction of iterations, summed from value 0 upwards, exceeds the level; the upper bound is found the same way, summing from input_t#ntype downwards. If input_t#brief is set, a run of consecutive slots with identical bounds is reduced to its first and its last row. */
static void
print_result(const input_t *pinput, const output_t *poutput)
{
    const double total = (double)pinput->iterations;

    // Brief mode only: the bounds of the most recently printed row, and whether any slots have been skipped since it was printed.
    unsigned shown_lower[NLEVELS];
    unsigned shown_upper[NLEVELS];
    bool have_shown = false;
    bool run_pending = false;

    for (unsigned slot = 0; slot < pinput->slots; slot++) {
        unsigned lower_bounds[NLEVELS];
        unsigned upper_bounds[NLEVELS];
        unsigned cum;
        unsigned level;

        // Lower bounds: accumulate the counters from the smallest value upwards.
        cum = 0;
        level = 0;
        for (unsigned value = 0; value <= pinput->ntype; value++) {
            cum += poutput->lower_bound[(size_t)value * pinput->slots + slot];
            const double fract = (double)cum / total;
            for (; level < NLEVELS && LEVELS[level] < fract; level++) {
                lower_bounds[level] = value;
            }
        }
        assert(level == NLEVELS);

        // Upper bounds: accumulate the counters from the largest value downwards.
        cum = 0;
        level = 0;
        for (unsigned j = 0; j <= pinput->ntype; j++) {
            const unsigned value = pinput->ntype - j;
            cum += poutput->upper_bound[(size_t)value * pinput->slots + slot];
            const double fract = (double)cum / total;
            for (; level < NLEVELS && LEVELS[level] < fract; level++) {
                upper_bounds[level] = value;
            }
        }
        assert(level == NLEVELS);

        if (!pinput->brief) {
            print_row(poutput->slot_threshold[slot], lower_bounds, upper_bounds);
            continue;
        }

        // Without a previously printed row, this row counts as changed.
        bool changed = !have_shown;
        for (size_t l = 0; !changed && l < NLEVELS; l++) {
            changed = shown_lower[l] != lower_bounds[l] || shown_upper[l] != upper_bounds[l];
        }

        if (!changed) {
            // Same bounds as the row printed last: skip it for now.
            run_pending = true;
            continue;
        }

        if (run_pending) {
            // Close the run of identical rows by printing its last slot.
            print_row(poutput->slot_threshold[slot - 1], shown_lower, shown_upper);
        }
        print_row(poutput->slot_threshold[slot], lower_bounds, upper_bounds);
        memcpy(shown_lower, lower_bounds, sizeof shown_lower);
        memcpy(shown_upper, upper_bounds, sizeof shown_upper);
        have_shown = true;
        run_pending = false;
    }

    // A run of identical rows that extends to the final slot still needs its last row.
    if (pinput->brief && have_shown && run_pending) {
        print_row(poutput->slot_threshold[pinput->slots - 1], shown_lower, shown_upper);
    }
}

/// @}
/// \addtogroup main Main program
/// @{

/// Main program.
/** Calls init_bitcount(), parses the command line into an #input_t and processes the input. Then, depending on input_t#raw_data, it either prints the raw data or prepares the slots, calculates the statistics, prints the result and frees the output. Finally it frees the input and returns \c EXIT_SUCCESS. */
int
main(int argc, char **argv)
{
    init_bitcount();

    input_t input;
    parse_command_line(&input, argc, argv);
    process_input(&input);

    if (!input.raw_data) {
        // Normal mode: bounds for each slot.
        output_t output;
        prepare_slots(&input, &output);
        calculate_statistics(&input, &output);
        print_result(&input, &output);
        free_output(&output);
    } else {
        // Raw mode: one row per accumulated sample.
        calculate_raw_data(&input);
    }

    free_input(&input);
    return EXIT_SUCCESS;
}

/// @}
