#define INDEX(n,x,y)    ((x)*(n) +(y))

int mergesort_c(uint8_t a[], size_t sets, size_t columnsize) {
    // Set up variables
    uint32_t flag = 0, remsets = sets, length = sets*columnsize;
    size_t i, j, set = 0, index = 0;
    uint8_t min, cur;
    
    // Initialize and zero the counters[] array.
    size_t counters[sets];
    memset(counters, 0, sets*sizeof(size_t));
    // Allocate the memory for the final sorted array.
    uint8_t *target = malloc(length*sizeof(uint8_t));
    if (target == NULL) {
        printf("Error: mergesort_c(): malloc() \\
                failed to allocate %d bytes\n", length);
        exit(10);
    }
    
    // While we have remaining sets to sort, loop
    while (remsets) {
        // set up the initial min value to the max value, 
        // so that it gets reset immediately.
        min = UCHAR_MAX;

        // Loop over all the sets
        for (j=0; j < sets; j++) {
            // Check if this particular set has been finished
            // (its bit is set) otherwise consider it when 
            // searching for the min value.
            if (!((1 << j) & flag)) {
                cur = a[INDEX(sets, counters[j], j)];
                
                if (min >= cur) {
                    min = cur;
                    set = j;
                }
            }
        }
        // Insert the min value to the next available position 
        // in the final array
        target[index++] = min;
        // Increase the counter for this particular set
        counters[set]++;
        // If this counter exceeds the columnsize, then 
        // we have finished the set. Set its bit in 
        // the flag variable to 1 and decrease remsets.
        if (counters[set] >= columnsize) {
            flag |= (1 << set);
            remsets--;
        }
    }
    // Copy the target array to our given array and free target
    memcpy(a, target, length);
    if (target)
        free(target);

    return;
}

int mergesort_l(uint32_t a[], size_t sets, size_t columnsize) {
    // Set up variables
    uint32_t flag = 0, remsets = sets, length = sets*columnsize;
    size_t i, j, set = 0, index = 0;
    uint32_t min, cur;
    
    // Initialize and zero the counters[] array.
    size_t counters[sets];
    memset(counters, 0, sets*sizeof(size_t));
    // Allocate the memory for the final sorted array.
    uint32_t *target = malloc(length*sizeof(uint32_t));
    if (target == NULL) {
        printf("Error: mergesort_c(): malloc() \\
                failed to allocate %d bytes\n", length);
        exit(10);
    }
    
    // While we have remaining sets to sort, loop
    while (remsets) {
        // set up the initial min value to the max value, 
        // so that it gets reset immediately.
        min = UINT_MAX;

        // Loop over all the sets
        for (j=0; j < sets; j++) {
            // Check if this particular set has been finished
            // (its bit is set) otherwise consider it when 
            // searching for the min value.
            if (!((1 << j) & flag)) {
                cur = a[INDEX(sets, counters[j], j)];
                
                if (min >= cur) {
                    min = cur;
                    set = j;
                }
            }
        }
        // Insert the min value to the next available position 
        // in the final array
        target[index++] = min;
        // Increase the counter for this particular set
        counters[set]++;
        // If this counter exceeds the columnsize, then 
        // we have finished the set. Set its bit in 
        // the flag variable to 1 and decrease remsets.
        if (counters[set] >= columnsize) {
            flag |= (1 << set);
            remsets--;
        }
    }
    // Copy the target array to our given array and free target
    memcpy(a, target, length*sizeof(uint32_t));
    if (target)
        free(target);

    return;
}


int vecinssort_c(uint8_t a[], int length) {
    vector uint8_t va_key, va_cur, va_next, va_first, va_second;
    vector bool char va_cmpmask;
    vector uint8_t *cur;
    int i, j, k;
    if (length >= 32) {
        // How many sets of N/16 do we have to deal with?
        loops = length/16;
        for(i = 1; i < loops; i++) {
            // Get the key set of elements
            // and load it into an Altivec vector register
            cur = a + i*16;
            va_key = vec_ld(0, cur);
            // Compare all the previous sets to the key set.
            for (j = i-1; j >= 0; j--) {
                cur = a + j*16;
                /* Load current and next 16-byte sets 
                into Altivec registers
                */
                va_cur = vec_ld(0, cur);
                va_next = vec_ld(16, cur);
                
                /* Generate the comparison mask between 
                the current vector and the key vector
                */
                va_cmpmask = vec_cmpgt(va_cur, va_key);
                
                /* And construct the first and second 
                result vectors to replace the 2 16-byte sets
                */
                va_first = vec_sel(va_cur, va_key, va_cmpmask);
                va_second = vec_sel(va_next, va_cur, va_cmpmask);
                
                // Store the vectors to their appropriate positions.
                vec_st(va_first, 0, cur);
                vec_st(va_second, 16, cur);
            }
        }
        // Call our extended merge sort on the sorted sets
        mergesort_c(a, 16, length/16);
    }
}

int vecinssort_l(uint32_t a[], int length) {
    vector uint32_t va_key, va_cur, va_next, va_first, va_second;
    vector bool char va_cmpmask;
    vector uint32_t *cur;
    int i, j, k;
    if (length >= 8) {
        // How many sets of N/16 do we have to deal with?
        loops = length/4;
        for(i = 1; i < loops; i++) {
            // Get the key set of elements
            // and load it into an Altivec vector register
            cur = a + i*4;
            va_key = vec_ld(0, cur);
            // Compare all the previous sets to the key set.
            for (j = i-1; j >= 0; j--) {
                cur = a + j*4;
                /* Load current and next 16-byte sets 
                into Altivec registers
                */
                va_cur = vec_ld(0, cur);
                va_next = vec_ld(16, cur);
                
                /* Generate the comparison mask between 
                the current vector and the key vector
                */
                va_cmpmask = vec_cmpgt(va_cur, va_key);
                
                /* And construct the first and second 
                result vectors to replace the 2 16-byte sets
                */
                va_first = vec_sel(va_cur, va_key, va_cmpmask);
                va_second = vec_sel(va_next, va_cur, va_cmpmask);
                
                // Store the vectors to their appropriate positions.
                vec_st(va_first, 0, cur);
                vec_st(va_second, 16, cur);
            }
        }
        // Call our extended merge sort on the sorted sets
        mergesort_c(a, 4, length/4);
    }
}