/***************************************************************************
 *   Copyright (C) 2005 by Konstantinos Margaritis                         *
 *   markos@debian.gr                                                      *
 *                                                                         *
 *   The ham_func2(), ham_func3() and ham_func4() are redistributed        *
 *   according to the licenses in the Berkeley DB 4.x. (hash/hash_func.c)  *
 *   All other functions are dual-licensed as GPL/BSD. Choose whichever    *
 *   seems appropriate.
 ***************************************************************************/

#include <sys/types.h>
#include <stdint.h>
#include <altivec.h>

#define DCHARHASH(h, c) ((h) = 0x63c63cd9*(h) + 0x9c39c33d + (c))

/* Original version of algorithm: h = 0x63c63cd9*(h) + 0x9c39c33d + (c)
*/

uint32_t
ham_func2(key, len)
        const void *key;
        uint32_t len;
{
        const uint8_t *e, *k;
        uint32_t h;
        uint8_t c;

        k = key;
        e = k + len;
        for (h = 0; k != e;) {
                c = *k++;
                if (!c && k > e)
                        break;
                DCHARHASH(h, c);
//              printf("h = %08x\n", h);
        }
        return (h);
}

/*
 * __ham_func3 --
 *      Ozan Yigit's original sdbm hash.
 *
 * Ugly, but fast.  Break the string up into 8 byte units.  On the first time
 * through the loop get the "leftover bytes" (strlen % 8).  On every other
 * iteration, perform 8 HASHC's so we handle all 8 bytes.  Essentially, this
 * saves us 7 cmp & branch instructions.
 *
 * PUBLIC: u_int32_t __ham_func3 __P((DB *, const void *, u_int32_t));
 */
u_int32_t
ham_func3(key, len)
        const void *key;
        u_int32_t len;
{
        const u_int8_t *k;
        u_int32_t n, loop;

        if (len == 0)
                return (0);

#define HASHC   n = *k++ + 65599 * n
        n = 0;
        k = key;

        loop = (len + 8 - 1) >> 3;
        switch (len & (8 - 1)) {
        case 0:
                do {
                        HASHC;
        case 7:
                        HASHC;
        case 6:
                        HASHC;
        case 5:
                        HASHC;
        case 4:
                        HASHC;
        case 3:
                        HASHC;
        case 2:
                        HASHC;
        case 1:
                        HASHC;
                } while (--loop);
        }
        return (n);
}

/*
 * __ham_func4 --
 *      Chris Torek's hash function.  Although this function performs only
 *      slightly worse than __ham_func5 on strings, it performs horribly on
 *      numbers.
 *
 * PUBLIC: u_int32_t __ham_func4 __P((DB *, const void *, u_int32_t));
 */
u_int32_t
ham_func4(key, len)
        const void *key;
        u_int32_t len;
{
        const u_int8_t *k;
        u_int32_t h, loop;
        
        if (len == 0)
                return (0);

#define HASH4a  h = (h << 5) - h + *k++;
#define HASH4b  h = (h << 5) + h + *k++;
#define HASH4   HASH4b
        h = 0;
        k = key;

        loop = (len + 8 - 1) >> 3;
        switch (len & (8 - 1)) {
        case 0:
                do {
                        HASH4;
        case 7:
                        HASH4;
        case 6:
                        HASH4;
        case 5:
                        HASH4;
        case 4:
                        HASH4;
        case 3:
                        HASH4;
        case 2:
                        HASH4;
        case 1:
                        HASH4;
                } while (--loop);
        }
        return (h);
}


/* 1st Vectorized version of algorithm: h = 0x63c63cd9*(h) + 0x9c39c33d + (c)
*/

uint32_t
ham_func2_vec(key, len)
        const void *key;
        uint32_t len;
{
        const uint8_t *e, *k;
        uint32_t h = 0, h0 = 0, i;
        uint32_t a[17] = {      1, 0x63c63cd9, 0x04226ff1, 0x5ad35f49,
                                0x74f6e0e1, 0xf2ff5ab9, 0x89cd42d1, 0x6b4c9f29,
                                0xddf685c1, 0x03969c99, 0xa0b499b1, 0xdf01c309,
                                0xc62a6ea1, 0xbf6b8279, 0x294bf491, 0x09da4ae9,
                                0x8ece1b81 };
        uint32_t absum[16] = {  0x9c39c33d, 0x2b208df2, 0x2b42cd5f, 0x84761cc4,
                                0x9eae1561, 0xf6d29e76, 0x6329bd43, 0xbdc8e508,
                                0xcd61c705, 0x7b9ea27a, 0x6c2214a7, 0xd01e68cc,
                                0x18586829, 0x6249a9fe, 0xe4f6638b, 0x2eeab810 };
        uint8_t c;

        k = key;
        e = k + len;
        
        int offset = (uint32_t) k % 16;
        if (offset > 0)
                h = absum[offset];
        while (offset--) {
                c = *k++;
                h += a[--len]*c;
        }
        h0 = h;

        while (len >= 16) {
                h = a[16]*h0 +absum[15];
                for (i=16; i > 0 ; i-- ) {
                        c = *k++;
                        h += a[i-1]*c;
                }
                len -= 16;
                h0 = h;
        }

        if (len) {
                h = a[len]*h0 + absum[len-1];
                for (i=len; i > 0 ; i-- ) {
                        c = *k++;
                        h += a[i-1]*c;
                }
        }

        return (h);
}

/* 2nd Vectorized version of algorithm: h = 0x63c63cd9*(h) + 0x9c39c33d + (c)
*/

uint32_t
ham_func2_vec2(key, len)
        const void *key;
        uint32_t len;
{
        const uint8_t *k = key;
        int i;
        uint32_t h = 0, h0 = 0, h2 = 0;
        uint32_t h_hi = 0, h_lo = 0;
        uint16_t ahi[17] = {    0, 0x63c6, 0x0422, 0x5ad3, 0x74f6, 0xf2ff, 0x89cd, 0x6b4c, 
                                0xddf6, 0x0396, 0xa0b4, 0xdf01,0xc62a, 0xbf6b, 0x294b, 0x09da,
                                0x8ece };
        uint16_t alo[17] = {    1, 0x3cd9, 0x6ff1, 0x5f49, 0xe0e1, 0x5ab9, 0x42d1, 0x9f29,
                                0x85c1, 0x9c99, 0x99b1, 0xc309, 0x6ea1, 0x8279, 0xf491, 0x4ae9,
                                0x1b81 };
        uint32_t absum[16] = {  0x9c39c33d, 0x2b208df2, 0x2b42cd5f, 0x84761cc4,
                                0x9eae1561, 0xf6d29e76, 0x6329bd43, 0xbdc8e508,
                                0xcd61c705, 0x7b9ea27a, 0x6c2214a7, 0xd01e68cc,
                                0x18586829, 0x6249a9fe, 0xe4f6638b, 0x2eeab810 };
        uint8_t c;

        int offset = (uint32_t) k % 16;
        if (offset > 0) {
                h = absum[offset];
                len -= offset;
        }
        while (offset--) {
                c = *k++;
                h += (((uint32_t)(ahi[offset])*c) << 16) + ((uint32_t)(alo[offset])*c);
                h0 = h;
        }

        if (len >= 16) {
            while (len >= 16) {
                    
                    h_hi = 0;
                    h_lo = 0;
                    h =  h_hi + h_lo;
                    h2 = h;
                    for (i=15; i >= 0 ; i-- ) {
                        c = *k++;
                        h_hi += ((uint32_t)(ahi[i])*c);
                        h_lo += (uint32_t)(alo[i])*c;
                        h2 += (((uint32_t)(ahi[i])*c) << 16) + ((uint32_t)(alo[i])*c);
                        h += h_hi + h_lo;
                    }
                    h_hi += (uint32_t)(ahi[16])*h0;
                    h_hi = h_hi << 16;
                    h_lo += (uint32_t)(alo[16])*h0;
                    h = absum[15] + h_hi + h_lo;
                    len -= 16;
                    h0 = h;
            }
        }
        
        if (len) {
                h_hi = 0;
                h_lo = 0;
                h =  h_hi + h_lo;
                h2 = h;
                for (i=len-1; i >= 0 ; i-- ) {
                        c = *k++;
                        h_hi += (((uint32_t)(ahi[i])*c) << 16);
                        h_lo += (uint32_t)(alo[i])*c;
                        h2 += (((uint32_t)(ahi[i])*c) << 16) + ((uint32_t)(alo[i])*c);
                        h += h_hi + h_lo;
                }
                h_hi += (uint32_t)(ahi[len])*h0 << 16;
                h_lo += (uint32_t)(alo[len])*h0;
                h = absum[len-1] + h_hi + h_lo;
        }

        return (h);
}

void printbuf16(char *label, unsigned char *buf) {
    int i;
    printf("%s : ", label);
    for (i=0; i < 16; i++) {
        printf("%02x ", buf[i]);
    }
    printf("\n");
}

void printvec_char(char *label, vector unsigned char vc) {
    int i;
    union {
        vector unsigned char v; 
        unsigned char c[16]; 
    } vt_char;
    
    vt_char.v = vc;
    printf("%s : ", label);
    for (i=0; i < 16; i++) {
        printf("%2x ", vt_char.c[i]);
    }
    printf("\n");
}

void printvec_short(char *label, vector unsigned short vs) {
    int i;
    union {
        vector unsigned short v; 
        unsigned short s[8]; 
    } vt_short;
    
    vt_short.v = vs;
    printf("%s : ", label);
    for (i=0; i < 8; i++) {
        printf("%04x ", vt_short.s[i]);
    }
    printf("\n");
}

void printvec_int(char *label, vector unsigned int vl) {
    int i;
    union {
        vector unsigned int v; 
        unsigned int l[4]; 
    } vt_int;
    
    vt_int.v = vl;
    printf("%s : ", label);
    for (i=0; i < 4; i++) {
        printf("%08x ", vt_int.l[i]);
    }
    printf("\n");
}


/* Altivec version of algorithm: h = 0x63c63cd9*(h) + 0x9c39c33d + (c)
   Up to 7.5x faster.
*/

uint32_t
ham_func2_vec3(key, len)
        const void *key;
        uint32_t len;
{
        const uint8_t *buf = key;
        
        uint32_t sum15, h0 = 0, h = 0;
        uint32_t a[17] = {      1, 0x63c63cd9, 0x04226ff1, 0x5ad35f49,
                                0x74f6e0e1, 0xf2ff5ab9, 0x89cd42d1, 0x6b4c9f29,
                                0xddf685c1, 0x03969c99, 0xa0b499b1, 0xdf01c309,
                                0xc62a6ea1, 0xbf6b8279, 0x294bf491, 0x09da4ae9,
                                0x8ece1b81 };
        uint32_t basum[16] = {  0x9c39c33d, 0x2b208df2, 0x2b42cd5f, 0x84761cc4,
                                0x9eae1561, 0xf6d29e76, 0x6329bd43, 0xbdc8e508,
                                0xcd61c705, 0x7b9ea27a, 0x6c2214a7, 0xd01e68cc,
                                0x18586829, 0x6249a9fe, 0xe4f6638b, 0x2eeab810 };
        uint8_t c;

        int offset = (uint32_t) buf % 16;
        if (offset > 0) {
                h = basum[offset];
                len -= offset;
        }
        while (offset--) {
                c = *buf++;
                h += a[--len]*c;
                h0 = h;
        }

        if (len >= 16) {
            uint32_t __attribute__ ((aligned(16))) s15[4];
            /* 
               We have to reverse the order of these:
               D_hi:    0, 0x63c6, 0x0422, 0x5ad3, 0x74f6, 0xf2ff, 0x89cd, 0x6b4c, 
                        0xddf6, 0x0396, 0xa0b4, 0xdf01,0xc62a, 0xbf6b, 0x294b, 0x09da
               D_lo:    1, 0x3cd9, 0x6ff1, 0x5f49, 0xe0e1, 0x5ab9, 0x42d1, 0x9f29,
                        0x85c1, 0x9c99, 0x99b1, 0xc309, 0x6ea1, 0x8279, 0xf491, 0x4ae9
            */
            vector uint16_t D_hi_0 = { 0x09da, 0x294b, 0xbf6b, 0xc62a, 0xdf01, 0xa0b4, 0x0396, 0xddf6 };
            vector uint16_t D_hi_1 = { 0x6b4c, 0x89cd, 0xf2ff, 0x74f6, 0x5ad3, 0x0422, 0x63c6, 0x0000 };
            
            vector uint16_t D_lo_0 = { 0x4ae9, 0xf491, 0x8279, 0x6ea1, 0xc309, 0x99b1, 0x9c99, 0x85c1 };
            vector uint16_t D_lo_1 = { 0x9f29, 0x42d1, 0x5ab9, 0xe0e1, 0x5f49, 0x6ff1, 0x3cd9, 0x0001 };
            
            vector uint32_t vs1, vs2, vs3, vs4, vs5, vs6, vs7, vs8, vs9, vs10;
            vector uint8_t vx1, vx2;
            vector uint16_t vy1_hi, vy1_lo, vy2_hi, vy2_lo;
            vector uint32_t shift16 = vec_splat_u32(-16);
            vector uint32_t zero = vec_splat_u32(0);
            
            while (len >= 32) {
                    vx1 = vec_ld(0, buf);
                    vx2 = vec_ld(16, buf);

                    vy1_hi = (vector uint16_t) vec_mergeh((vector uint8_t)zero, vx1);
                    vy1_lo = (vector uint16_t) vec_mergel((vector uint8_t)zero, vx1);
                    vy2_hi = (vector uint16_t) vec_mergeh((vector uint8_t)zero, vx2);
                    vy2_lo = (vector uint16_t) vec_mergel((vector uint8_t)zero, vx2);
         
                    vs1 = vec_msum( D_hi_0, vy1_hi, zero );
                    vs2 = vec_msum( D_hi_1, vy1_lo, vs1 );
                    
                    vs3 = vec_sl( vs2, shift16 );
                    vs4 = vec_msum( D_lo_0, vy1_hi, vs3 );
                    vs5 = vec_msum( D_lo_1, vy1_lo, vs4 );
                    
                    vs6 = vec_msum( D_hi_0, vy2_hi, zero );
                    vs7 = vec_msum( D_hi_1, vy2_lo, vs6 );
                    vs8 = vec_sl( vs7, shift16 );
                    vs9 = vec_msum( D_lo_0, vy2_hi, vs8 );
                    vs10 = vec_msum( D_lo_1, vy2_lo, vs9 );
                    
                    vec_st(vs5, 0, &s15[0]);
                    sum15 = s15[0] + s15[1] + s15[2] + s15[3];
                    h = a[16]*h0 + basum[15] + sum15;
                    h0 = h;
                    vec_st(vs10, 0, &s15[0]);
                    sum15 = s15[0] + s15[1] + s15[2] + s15[3];
                    h = a[16]*h0 + basum[15] + sum15;
                    h0 = h;
                    buf += 32;
                    len -= 32;
            }
            
            if (len >= 16) {
                vx1 = vec_ld(0, buf);
                vy1_hi = (vector uint16_t) vec_mergeh((vector uint8_t)zero, vx1);
                vy1_lo = (vector uint16_t) vec_mergel((vector uint8_t)zero, vx1);
                
                vs1 = vec_msum( D_hi_0, vy1_hi, zero );
                vs2 = vec_msum( D_hi_1, vy1_lo, vs1 );
                vs3 = vec_sl( vs2, shift16 );
                
                vs4 = vec_msum( D_lo_0, vy1_hi, vs3);
                vs5 = vec_msum( D_lo_1, vy1_lo, vs4 );
                
                vec_st(vs5, 0, &s15[0]);
                sum15 = s15[0] + s15[1] + s15[2] + s15[3];
                h = a[16]*h0 + basum[15] + sum15;
                buf += 16;
                len -= 16;
                h0 = h;
            }
        }

        if (len >= 8) {
            uint32_t __attribute__ ((aligned(16))) s15[4];
            vector uint16_t D_hi = { 0x6b4c, 0x89cd, 0xf2ff, 0x74f6, 0x5ad3, 0x0422, 0x63c6, 0x0000 };
            vector uint16_t D_lo = { 0x9f29, 0x42d1, 0x5ab9, 0xe0e1, 0x5f49, 0x6ff1, 0x3cd9, 0x0001 };
            
            vector uint32_t vs1, vs2, vs3;
            vector uint8_t vx;
            vector uint16_t vy;
            vector uint32_t shift16 = vec_splat_u32(-16);
            vector uint32_t zero = vec_splat_u32(0);
            vx = vec_ld(0, buf);
            vy = (vector uint16_t) vec_mergeh((vector uint8_t)zero, vx);
                    
            vs1 = vec_msum( D_hi, vy, zero );
            vs2 = vec_sl( vs1, shift16 );
            vs3 = vec_msum( D_lo, vy, vs2 );
            
            vec_st(vs3, 0, &s15[0]);
            sum15 = s15[0] + s15[1] + s15[2] + s15[3];
            h = a[8]*h0 + basum[7] + sum15;
                    
            buf += 8;
            len -= 8;
            h0 = h;
        }
        
        if (len) {
            h = a[len]*h0 + basum[len-1];
            switch (len) {
                case 7:
                    h += a[6]*buf[0] + a[5]*buf[1] +a[4]*buf[2] + a[3]*buf[3] + a[2]*buf[4] + a[1]*buf[5] + a[0]*buf[6];
                    break;
                case 6:
                    h += a[5]*buf[0] + a[4]*buf[1] +a[3]*buf[2] + a[2]*buf[3] + a[1]*buf[4] + a[0]*buf[5];
                    break;
                case 5:
                    h += a[4]*buf[0] + a[3]*buf[1] +a[2]*buf[2] + a[1]*buf[3] + a[0]*buf[4];
                    break;
                case 4:
                    h += a[3]*buf[0] + a[2]*buf[1] +a[1]*buf[2] + a[0]*buf[3];
                    break;
                case 3:
                    h += a[2]*buf[0] + a[1]*buf[1] +a[0]*buf[2];
                    break;
                case 2:
                    h += a[1]*buf[0] + a[0]*buf[1];
                    break;
                case 1:
                    h += a[0]*buf[0];
                    break;
            }
        }

        return (h);
}

/* Altivec version of algorithm: h = 0x63c63cd9*(h) + 0x9c39c33d + (c)
   Up to 3.5x faster.
*/

uint32_t
ham_func3_vec(key, len)
        const void *key;
        uint32_t len;
{
        const uint8_t *buf = key;
        uint32_t __attribute__ ((aligned(16))) s15[4];
        uint32_t sum15, h0 = 0, h = 0;
        uint32_t a[17] = {  0x00000001, 0x0001003F, 0x007e0f81, 0x2e86d0bf, 
                            0x43ec5f01, 0x162c613f, 0xd62aee81, 0xa311b1bf,
                            0xd319be01, 0xb156c23f, 0x6698cd81, 0x0d1b92bf,
                            0xcc881d01, 0x7280233f, 0x50c7ac81, 0x8da473bf,
                            0x4f377c01
                         };
        uint8_t c;

        int offset = (uint32_t) buf % 16;
        if (offset > 0) {
                //printf("offset = %x\n", offset);
                len -= offset;
        }
        while (offset--) {
                c = *buf++;
                h += a[--len]*c;
                //printf("1: h = %08x\n", h);
                h0 = h;
        }

        //printf("vector: 1.1: h0 = %08x, len = %d\n", h0, len);
        if (len >= 16) {
            /* 
               We have to reverse the order of these:
               D_hi:    0x0000, 0x0001, 0x007e, 0x2e86, 0x43ec, 0x162c, 0xd62a, 0xa311,
                        0xd319, 0xb156, 0x6698, 0x0d1b, 0xcc88, 0x7280, 0x50c7, 0x8da4,
               D_lo:    0x0001, 0x003F, 0x0f81, 0xd0bf, 0x5f01, 0x613f, 0xee81, 0xb1bf,
                        0xbe01, 0xc23f, 0xcd81, 0x92bf, 0x1d01, 0x233f, 0xac81, 0x73bf
            */
            vector uint16_t D_hi_0 = { 0x8da4, 0x50c7, 0x7280, 0xcc88, 0x0d1b, 0x6698, 0xb156, 0xd319 };
            vector uint16_t D_hi_1 = { 0xa311, 0xd62a, 0x162c, 0x43ec, 0x2e86, 0x007e, 0x0001, 0x0000 };
            
            vector uint16_t D_lo_0 = { 0x73bf, 0xac81, 0x233f, 0x1d01, 0x92bf, 0xcd81, 0xc23f, 0xbe01 };
            vector uint16_t D_lo_1 = { 0xb1bf, 0xee81, 0x613f, 0x5f01, 0xd0bf, 0x0f81, 0x003f, 0x0001 };
            
            vector uint32_t vs1, vs2, vs3, vs4, vs5, vs6, vs7, vs8, vs9, vs10;
            vector uint8_t vx1, vx2;
            vector uint16_t vy1_hi, vy1_lo, vy2_hi, vy2_lo;
            vector uint32_t shift16 = vec_splat_u32(-16);
            vector uint32_t zero = vec_splat_u32(0);
            
            while (len >= 32) {
                    vx1 = vec_ld(0, buf);
                    vx2 = vec_ld(16, buf);

                    vy1_hi = (vector uint16_t) vec_mergeh((vector uint8_t)zero, vx1);
                    vy1_lo = (vector uint16_t) vec_mergel((vector uint8_t)zero, vx1);
                    vy2_hi = (vector uint16_t) vec_mergeh((vector uint8_t)zero, vx2);
                    vy2_lo = (vector uint16_t) vec_mergel((vector uint8_t)zero, vx2);
         
                    vs1 = vec_msum( D_hi_0, vy1_hi, zero );
                    vs2 = vec_msum( D_hi_1, vy1_lo, vs1 );
                    
                    vs3 = vec_sl( vs2, shift16 );
                    vs4 = vec_msum( D_lo_0, vy1_hi, vs3 );
                    vs5 = vec_msum( D_lo_1, vy1_lo, vs4 );
                    
                    vs6 = vec_msum( D_hi_0, vy2_hi, zero );
                    vs7 = vec_msum( D_hi_1, vy2_lo, vs6 );
                    vs8 = vec_sl( vs7, shift16 );
                    vs9 = vec_msum( D_lo_0, vy2_hi, vs8 );
                    vs10 = vec_msum( D_lo_1, vy2_lo, vs9 );
                    
                    vec_st(vs5, 0, &s15[0]);
                    sum15 = s15[0] + s15[1] + s15[2] + s15[3];
                    h = a[16]*h0 + sum15;
                    h0 = h;
                    vec_st(vs10, 0, &s15[0]);
                    sum15 = s15[0] + s15[1] + s15[2] + s15[3];
                    h = a[16]*h0 + sum15;
                    h0 = h;
                    buf += 32;
                    len -= 32;
            }
            
            if (len >= 16) {
                vx1 = vec_ld(0, buf);
                vy1_hi = (vector uint16_t) vec_mergeh((vector uint8_t)zero, vx1);
                vy1_lo = (vector uint16_t) vec_mergel((vector uint8_t)zero, vx1);
                
                vs1 = vec_msum( D_hi_0, vy1_hi, zero );
                vs2 = vec_msum( D_hi_1, vy1_lo, vs1 );
                vs3 = vec_sl( vs2, shift16 );
                
                vs4 = vec_msum( D_lo_0, vy1_hi, vs3);
                vs5 = vec_msum( D_lo_1, vy1_lo, vs4 );
                
                vec_st(vs5, 0, &s15[0]);
                sum15 = s15[0] + s15[1] + s15[2] + s15[3];
                h = a[16]*h0 + sum15;
                buf += 16;
                len -= 16;
                h0 = h;
            }
        }

        if (len >= 8) {
            vector uint16_t D_hi = { 0xa311, 0xd62a, 0x162c, 0x43ec, 0x2e86, 0x007e, 0x0001, 0x0000 };
            vector uint16_t D_lo = { 0xb1bf, 0xee81, 0x613f, 0x5f01, 0xd0bf, 0x0f81, 0x003f, 0x0001 };
            
            vector uint32_t vs1, vs2, vs3;
            vector uint8_t vx;
            vector uint16_t vy;
            vector uint32_t shift16 = vec_splat_u32(-16);
            vector uint32_t zero = vec_splat_u32(0);
            vx = vec_ld(0, buf);
            vy = (vector uint16_t) vec_mergeh((vector uint8_t)zero, vx);
                    
            vs1 = vec_msum( D_hi, vy, zero );
            vs2 = vec_sl( vs1, shift16 );
            vs3 = vec_msum( D_lo, vy, vs2 );
            
            vec_st(vs3, 0, &s15[0]);
            sum15 = s15[0] + s15[1] + s15[2] + s15[3];
            h = a[8]*h0 + sum15;
                    
            buf += 8;
            len -= 8;
            h0 = h;
        }
        
        if (len) {
            h = a[len]*h0;
            switch (len) {
                case 7:
                    h += a[6]*buf[0] + a[5]*buf[1] +a[4]*buf[2] + a[3]*buf[3] + a[2]*buf[4] + a[1]*buf[5] + a[0]*buf[6];
                    break;
                case 6:
                    h += a[5]*buf[0] + a[4]*buf[1] +a[3]*buf[2] + a[2]*buf[3] + a[1]*buf[4] + a[0]*buf[5];
                    break;
                case 5:
                    h += a[4]*buf[0] + a[3]*buf[1] +a[2]*buf[2] + a[1]*buf[3] + a[0]*buf[4];
                    break;
                case 4:
                    h += a[3]*buf[0] + a[2]*buf[1] +a[1]*buf[2] + a[0]*buf[3];
                    break;
                case 3:
                    h += a[2]*buf[0] + a[1]*buf[1] +a[0]*buf[2];
                    break;
                case 2:
                    h += a[1]*buf[0] + a[0]*buf[1];
                    break;
                case 1:
                    h += a[0]*buf[0];
                    break;
            }
        }

        return (h);
}

/* Altivec version of algorithm: h = 0x63c63cd9*(h) + 0x9c39c33d + (c)
   Up to 2.2x faster.
*/

uint32_t
ham_func4_vec(key, len)
        const void *key;
        uint32_t len;
{
        const uint8_t *buf = key;
        uint32_t __attribute__ ((aligned(16))) s15[4];
        uint32_t sum15, h0 = 0, h = 0;
        uint32_t a[17] = {  0x00000001, 0x00000021, 0x00000441, 0x00008c61, 
                            0x00121881, 0x025528a1, 0x4cfa3cc1, 0xec41d4e1,
                            0x747c7101, 0x040a9121, 0x855cb541, 0x30f35d61,
                            0x4f5f0981, 0x3b4039a1, 0xa3476dc1, 0x0c3525e1,
                            0x92d9e201
                         };
        uint8_t c;

        int offset = (uint32_t) buf % 16;
        if (offset > 0) {
                //printf("offset = %x\n", offset);
                len -= offset;
        }
        while (offset--) {
                c = *buf++;
                h += a[--len]*c;
                //printf("1: h = %08x\n", h);
                h0 = h;
        }

        //printf("vector: 1.1: h0 = %08x, len = %d\n", h0, len);
        if (len >= 16) {
            /* 
               We have to reverse the order of these:
               D_hi:    0x0000, 0x0000, 0x0000, 0x0000, 0x0012, 0x0255, 0x4cfa, 0xec41,
                        0x747c, 0x040a, 0x855c, 0x30f3, 0x4f5f, 0x3b40, 0xa347, 0x0c35

               D_lo:    0x0001, 0x0021, 0x0441, 0x8c61, 0x1881, 0x28a1, 0x3cc1, 0xd4e1,
                        0x7101, 0x9121, 0xb541, 0x5d61, 0x0981, 0x39a1, 0x6dc1, 0x25e1
            */
            vector uint16_t D_hi_0 = { 0x0c35, 0xa347, 0x3b40, 0x4f5f, 0x30f3, 0x855c, 0x040a, 0x747c };
            vector uint16_t D_hi_1 = { 0xec41, 0x4cfa, 0x0255, 0x0012, 0x0000, 0x0000, 0x0000, 0x0000 };
            
            vector uint16_t D_lo_0 = { 0x25e1, 0x6dc1, 0x39a1, 0x0981, 0x5d61, 0xb541, 0x9121, 0x7101 };
            vector uint16_t D_lo_1 = { 0xd4e1, 0x3cc1, 0x28a1, 0x1881, 0x8c61, 0x0441, 0x0021, 0x0001 };
            
            vector uint32_t vs1, vs2, vs3, vs4, vs5, vs6, vs7, vs8, vs9, vs10;
            vector uint8_t vx1, vx2;
            vector uint16_t vy1_hi, vy1_lo, vy2_hi, vy2_lo;
            vector uint32_t shift16 = vec_splat_u32(-16);
            vector uint32_t zero = vec_splat_u32(0);
            
            while (len >= 32) {
                    vx1 = vec_ld(0, buf);
                    vx2 = vec_ld(16, buf);

                    vy1_hi = (vector uint16_t) vec_mergeh((vector uint8_t)zero, vx1);
                    vy1_lo = (vector uint16_t) vec_mergel((vector uint8_t)zero, vx1);
                    vy2_hi = (vector uint16_t) vec_mergeh((vector uint8_t)zero, vx2);
                    vy2_lo = (vector uint16_t) vec_mergel((vector uint8_t)zero, vx2);
         
                    vs1 = vec_msum( D_hi_0, vy1_hi, zero );
                    vs2 = vec_msum( D_hi_1, vy1_lo, vs1 );
                    
                    vs3 = vec_sl( vs2, shift16 );
                    vs4 = vec_msum( D_lo_0, vy1_hi, vs3 );
                    vs5 = vec_msum( D_lo_1, vy1_lo, vs4 );
                    
                    vs6 = vec_msum( D_hi_0, vy2_hi, zero );
                    vs7 = vec_msum( D_hi_1, vy2_lo, vs6 );
                    vs8 = vec_sl( vs7, shift16 );
                    vs9 = vec_msum( D_lo_0, vy2_hi, vs8 );
                    vs10 = vec_msum( D_lo_1, vy2_lo, vs9 );
                    
                    vec_st(vs5, 0, &s15[0]);
                    sum15 = s15[0] + s15[1] + s15[2] + s15[3];
                    h = a[16]*h0 + sum15;
                    h0 = h;
                    vec_st(vs10, 0, &s15[0]);
                    sum15 = s15[0] + s15[1] + s15[2] + s15[3];
                    h = a[16]*h0 + sum15;
                    h0 = h;
                    buf += 32;
                    len -= 32;
            }
            
            if (len >= 16) {
                vx1 = vec_ld(0, buf);
                vy1_hi = (vector uint16_t) vec_mergeh((vector uint8_t)zero, vx1);
                vy1_lo = (vector uint16_t) vec_mergel((vector uint8_t)zero, vx1);
                
                vs1 = vec_msum( D_hi_0, vy1_hi, zero );
                vs2 = vec_msum( D_hi_1, vy1_lo, vs1 );
                vs3 = vec_sl( vs2, shift16 );
                
                vs4 = vec_msum( D_lo_0, vy1_hi, vs3);
                vs5 = vec_msum( D_lo_1, vy1_lo, vs4 );
                
                vec_st(vs5, 0, &s15[0]);
                sum15 = s15[0] + s15[1] + s15[2] + s15[3];
                h = a[16]*h0 + sum15;
                buf += 16;
                len -= 16;
                h0 = h;
            }
        }

        if (len >= 8) {
            vector uint16_t D_hi = { 0xec41, 0x4cfa, 0x0255, 0x0012, 0x0000, 0x0000, 0x0000, 0x0000 };
            vector uint16_t D_lo = { 0xd4e1, 0x3cc1, 0x28a1, 0x1881, 0x8c61, 0x0441, 0x0021, 0x0001 };
            
            vector uint32_t vs1, vs2, vs3;
            vector uint8_t vx;
            vector uint16_t vy;
            vector uint32_t shift16 = vec_splat_u32(-16);
            vector uint32_t zero = vec_splat_u32(0);
            vx = vec_ld(0, buf);
            vy = (vector uint16_t) vec_mergeh((vector uint8_t)zero, vx);
                    
            vs1 = vec_msum( D_hi, vy, zero );
            vs2 = vec_sl( vs1, shift16 );
            vs3 = vec_msum( D_lo, vy, vs2 );
            
            vec_st(vs3, 0, &s15[0]);
            sum15 = s15[0] + s15[1] + s15[2] + s15[3];
            h = a[8]*h0 + sum15;
                    
            buf += 8;
            len -= 8;
            h0 = h;
        }
        
        if (len) {
            h = a[len]*h0;
            switch (len) {
                case 7:
                    h += a[6]*buf[0] + a[5]*buf[1] +a[4]*buf[2] + a[3]*buf[3] + a[2]*buf[4] + a[1]*buf[5] + a[0]*buf[6];
                    break;
                case 6:
                    h += a[5]*buf[0] + a[4]*buf[1] +a[3]*buf[2] + a[2]*buf[3] + a[1]*buf[4] + a[0]*buf[5];
                    break;
                case 5:
                    h += a[4]*buf[0] + a[3]*buf[1] +a[2]*buf[2] + a[1]*buf[3] + a[0]*buf[4];
                    break;
                case 4:
                    h += a[3]*buf[0] + a[2]*buf[1] +a[1]*buf[2] + a[0]*buf[3];
                    break;
                case 3:
                    h += a[2]*buf[0] + a[1]*buf[1] +a[0]*buf[2];
                    break;
                case 2:
                    h += a[1]*buf[0] + a[0]*buf[1];
                    break;
                case 1:
                    h += a[0]*buf[0];
                    break;
            }
        }

        return (h);
}
