/* adler32.c -- compute the Adler-32 checksum of a data stream
 * Copyright (C) 1995-2003 Mark Adler
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

/* @(#) $Id$ */

#define ZLIB_INTERNAL
#include "zlib.h"

/* Modulus of the Adler-32 sums. */
#define BASE 65521UL    /* largest prime smaller than 65536 */
/* Maximum number of bytes that may be accumulated between two MOD()
   reductions: NMAX for the 32-bit unsigned scalar sums, NMAX_VEC for the
   signed 32-bit vector lanes. Both bounds assume the sums start below BASE. */
#define NMAX 5552
#define NMAX_VEC 3854
/* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */
/* NMAX_VEC is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^31-1 */

/* Unrolled accumulation steps. DO1 adds byte buf[i] to s1 and then adds the
   new s1 to s2; DO2..DO16 chain it over 2..16 consecutive bytes. The macros
   reference variables named s1 and s2 that must exist at the point of use. */
#define DO1(buf,i)  {s1 += buf[i]; s2 += s1;}
#define DO2(buf,i)  DO1(buf,i); DO1(buf,i+1);
#define DO4(buf,i)  DO2(buf,i); DO2(buf,i+2);
#define DO8(buf,i)  DO4(buf,i); DO4(buf,i+4);
#define DO16(buf)   DO8(buf,0); DO8(buf,8);

/* MOD(a) reduces the lvalue a modulo BASE in place.
 *
 * With NO_DIVIDE defined the reduction avoids the % operator: it
 * conditionally subtracts BASE << 16, BASE << 15, ..., BASE in turn, each
 * step halving the upper bound on a. The ladder yields a fully reduced
 * result provided a < 2 * (BASE << 16) on entry, which covers every 32-bit
 * value. The argument is evaluated many times, so it must have no side
 * effects.
 *
 * Without NO_DIVIDE it is a plain "a %= BASE".
 */
#ifdef NO_DIVIDE
#  define MOD(a) \
    do { \
        if (a >= (BASE << 16)) a -= (BASE << 16); \
        if (a >= (BASE << 15)) a -= (BASE << 15); \
        if (a >= (BASE << 14)) a -= (BASE << 14); \
        if (a >= (BASE << 13)) a -= (BASE << 13); \
        if (a >= (BASE << 12)) a -= (BASE << 12); \
        if (a >= (BASE << 11)) a -= (BASE << 11); \
        if (a >= (BASE << 10)) a -= (BASE << 10); \
        if (a >= (BASE << 9)) a -= (BASE << 9); \
        if (a >= (BASE << 8)) a -= (BASE << 8); \
        if (a >= (BASE << 7)) a -= (BASE << 7); \
        if (a >= (BASE << 6)) a -= (BASE << 6); \
        if (a >= (BASE << 5)) a -= (BASE << 5); \
        if (a >= (BASE << 4)) a -= (BASE << 4); \
        if (a >= (BASE << 3)) a -= (BASE << 3); \
        if (a >= (BASE << 2)) a -= (BASE << 2); \
        if (a >= (BASE << 1)) a -= (BASE << 1); \
        if (a >= BASE) a -= BASE; \
    } while (0)
#else
#  define MOD(a) a %= BASE
#endif

/* ========================================================================= */
//#ifndef HAS_ALTIVEC_H
/*
 * Update a running Adler-32 checksum with len bytes read from buf.
 *
 *   adler  previous checksum; its low 16 bits are the s1 sum and its high
 *          16 bits the s2 sum
 *   buf    bytes to fold in; when Z_NULL the function returns 1
 *   len    number of bytes available at buf
 *
 * Returns the new checksum packed as (s2 << 16) | s1.
 */
uLong ZEXPORT adler32(adler, buf, len)
    uLong adler;
    const Bytef *buf;
    uInt len;
{
    /* These two must keep the names s1/s2: the DO* macros refer to them. */
    unsigned long s1 = adler & 0xffff;
    unsigned long s2 = (adler >> 16) & 0xffff;
    uInt chunk;

    if (buf == Z_NULL)
        return 1L;

    while (len != 0) {
        /* Take at most NMAX bytes per pass so the sums cannot overflow
           before they are reduced below. */
        chunk = (len < NMAX) ? len : NMAX;
        len -= chunk;

        /* Unrolled part: sixteen bytes per step. */
        for (; chunk >= 16; chunk -= 16) {
            DO16(buf);
            buf += 16;
        }

        /* Remaining 0..15 bytes of this pass, one at a time. */
        for (; chunk != 0; chunk--) {
            s1 += *buf++;
            s2 += s1;
        }

        MOD(s1);
        MOD(s2);
    }

    return (s2 << 16) | s1;
}
//#else

#include <altivec.h>

/* Shorthand for the scalar running sums used by adler32_vec(): they live in
   element 3 of its local s1[] and s2[] arrays, the same element the vector
   code loads from and stores to. */
#define S1  s1[3]
#define S2  s2[3]

/*
 * AltiVec implementation of the Adler-32 update.
 *
 *   adler  previous checksum; its low 16 bits are the s1 sum and its high
 *          16 bits the s2 sum
 *   buf    bytes to fold in; when Z_NULL the function returns 1
 *   len    number of bytes available at buf
 *
 * Returns the new checksum packed as (s2 << 16) | s1.
 *
 * Inputs of 16 bytes or more are handled in three phases: a scalar prologue
 * that advances buf to a 16-byte boundary, a vector loop consuming 16 bytes
 * per step, and a scalar epilogue for the last 0..15 bytes. Shorter inputs
 * go straight to the epilogue.
 *
 * NOTE(review): the vec_sll step treats word element 3 as the
 * least-significant word of the 128-bit register (big-endian element
 * order) -- confirm before enabling this on a little-endian target.
 */
uLong ZEXPORT adler32_vec(adler, buf, len)
    uLong adler;
    const Bytef *buf;
    uInt len;
{
    /* The running sums live in element 3 (S1/S2). The lanes must be exactly
       32 bits wide so that element 3 is at byte offset 12, the offset used
       by vec_lde/vec_ste below; "unsigned long" breaks this on 64-bit ABIs. */
    unsigned int __attribute__ ((aligned(16))) s1[4], s2[4];
    const vector unsigned int vzero = vec_splat_u32(0);
    const vector unsigned char vones = vec_splat_u8(1);
    /* Weight of each byte of a 16-byte group in the s2 sum. */
    const vector unsigned char vweights =
        { 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 };
    /* Shift count of 4 bits, i.e. multiplication by 16. */
    const vector unsigned char vshift = vec_splat_u8(4);
    vector signed int vs1, vs2, vs1_prev;
    vector unsigned char vbuf;
    unsigned int head;
    int k;

    if (buf == Z_NULL) return 1L;

    S1 = (unsigned int)(adler & 0xffff);
    S2 = (unsigned int)((adler >> 16) & 0xffff);

    if (len >= 16) {
        /* Scalar prologue: consume bytes until buf is 16-byte aligned.
           len >= 16 and head <= 15, so len cannot underflow. */
        head = (unsigned int)((unsigned long)buf % 16);
        if (head) {
            head = 16 - head;
            len -= head;
            while (head--) {
                S1 += *buf++;
                S2 += S1;
            }
            /* NMAX_VEC is derived for sums that start below BASE. Without
               this reduction the prologue bytes plus a full vector chunk
               can push the s2 lane past 2^31-1, where the saturating
               vec_sums would clamp it. */
            MOD(S1);
            MOD(S2);
        }

        while (len >= 16) {
            /* Load the scalar sums into element 3 of the accumulators. */
            vs1 = vec_lde(12, (signed int *)s1);
            vs2 = vec_lde(12, (signed int *)s2);
            vs1_prev = vs1;

            /* Largest multiple of 16 not exceeding min(len, NMAX_VEC). */
            k = len < NMAX_VEC ? (int)len : NMAX_VEC;
            k -= k % 16;
            len -= k;

            while (k >= 16) {
                vbuf = vec_ld(0, (const unsigned char *)buf);
                /* s1 += sum of the 16 bytes */
                vs1 = vec_sums((vector signed int)vec_msum(vbuf, vones, vzero),
                               vs1);
                /* s2 += weighted byte sum + 16 * (s1 before this group) */
                vs2 = vec_sums((vector signed int)vec_msum(vbuf, vweights, vzero),
                               vs2);
                vs2 = vec_add(vs2, vec_sll(vs1_prev, vshift));
                vs1_prev = vs1;
                buf += 16;
                k -= 16;
            }

            /* Write element 3 back and reduce before the next chunk. */
            vec_ste(vs1, 12, (signed int *)s1);
            vec_ste(vs2, 12, (signed int *)s2);
            MOD(S1);
            MOD(S2);
        }
    }

    /* Scalar epilogue: the whole input when it was shorter than 16 bytes,
       otherwise the 0..15 bytes left after the vector loop. */
    while (len--) {
        S1 += *buf++;
        S2 += S1;
    }
    MOD(S1);
    MOD(S2);

    return ((uLong)S2 << 16) | S1;
}
//#endif
