// Work-group size the reduction is compiled for. Supply a different value at
// build time (before this point) to override the default of 64.
#if !defined(GROUP_SIZE)
#define GROUP_SIZE (64)
#endif

// Number of passes the kernel's outer loop performs. Defaults to a single pass.
#if !defined(OPERATIONS)
#define OPERATIONS (1)
#endif
 
////////////////////////////////////////////////////////////////////////////////////////////////////
 
// Two and three times the work-group size. The *_LOCAL_I4 macros use these as
// the int offsets of the third and fourth vector component in local memory.
#define GROUP_SIZE_2 (2 * (GROUP_SIZE))
#define GROUP_SIZE_3 (3 * (GROUP_SIZE))
 
////////////////////////////////////////////////////////////////////////////////////////////////////
 
// Reads the int4 at element index `idx` from the global buffer `buf`
// (vload4 addresses the buffer as ints, four per element).
#define LOAD_GLOBAL_I4(buf, idx) vload4((size_t)(idx), (__global const int*)(buf))

// Writes the int4 `val` to element index `idx` of the global buffer `buf`.
#define STORE_GLOBAL_I4(buf, idx, val) vstore4((val), (size_t)(idx), (__global int*)(buf))
 
////////////////////////////////////////////////////////////////////////////////////////////////////
 
// Reads one int at int index `idx` of the local buffer `base`.
#define LOAD_LOCAL_I1(base, idx) (((__local const int*)(base))[(size_t)(idx)])

// Writes the int `val` at int index `idx` of the local buffer `base`.
#define STORE_LOCAL_I1(base, idx, val) ((__local int*)(base))[(size_t)(idx)] = (val)
 
// Gathers an int4 from the local buffer `s`. The four components of element
// `i` are not adjacent: component k is read from int index (i) + k * GROUP_SIZE.
//
// `i` is parenthesized before the plane offset is added, so an index argument
// containing a low-precedence operator (?:, <<, |, ...) addresses the intended
// element. `s` and `i` are each expanded four times; pass side-effect-free
// expressions.
#define LOAD_LOCAL_I4(s, i) \
    ((int4)( (LOAD_LOCAL_I1((s), (i)               )), \
             (LOAD_LOCAL_I1((s), (i) + GROUP_SIZE  )), \
             (LOAD_LOCAL_I1((s), (i) + GROUP_SIZE_2)), \
             (LOAD_LOCAL_I1((s), (i) + GROUP_SIZE_3))))
 
// Scatters the int4 `v` into the local buffer `s`, mirroring LOAD_LOCAL_I4:
// component k is written to int index (i) + k * GROUP_SIZE.
//
// The expansion is a single statement (do/while(0)), so the macro is safe under
// an unbraced if/else; terminate the invocation with ';'. Components are
// selected with .s0-.s3 because subscripting a vector with [] is not part of
// the vector component syntax defined by OpenCL C. `i` is parenthesized before
// the plane offset is added. `s`, `i` and `v` are each expanded four times;
// pass side-effect-free expressions.
#define STORE_LOCAL_I4(s, i, v) \
    do { \
        STORE_LOCAL_I1((s), (i),                (v).s0); \
        STORE_LOCAL_I1((s), (i) + GROUP_SIZE,   (v).s1); \
        STORE_LOCAL_I1((s), (i) + GROUP_SIZE_2, (v).s2); \
        STORE_LOCAL_I1((s), (i) + GROUP_SIZE_3, (v).s3); \
    } while (0)
 
// Adds element `j` of the local buffer `s` into element `i`:
//     s[i] = s[i] + s[j]   (component-wise, via LOAD_LOCAL_I4 / STORE_LOCAL_I4)
//
// The expansion is a single statement (do/while(0)); terminate the invocation
// with ';'. The temporaries carry macro-specific names so that caller
// expressions mentioning variables such as x or y are not captured, and the
// indices are parenthesized before being forwarded. `i` is expanded more than
// once; pass side-effect-free expressions.
#define ACCUM_LOCAL_I4(s, i, j) \
    do { \
        const int4 accum_local_i4_lhs_ = LOAD_LOCAL_I4((s), (i)); \
        const int4 accum_local_i4_rhs_ = LOAD_LOCAL_I4((s), (j)); \
        const int4 accum_local_i4_sum_ = accum_local_i4_lhs_ + accum_local_i4_rhs_; \
        STORE_LOCAL_I4((s), (i), accum_local_i4_sum_); \
    } while (0)
////////////////////////////////////////////////////////////////////////////////////////////////////
 
// Per-work-group sum reduction over int4 elements.
//
// Every work-item sums pairs of input elements (the element at index i and the
// one GROUP_SIZE further on), the per-work-item sums are combined through
// `shared` by successive halving, and the work-item with local id 0 writes the
// result of its group to output[group_id].
//
//   output  receives one int4 per work-group, at index group_id
//   input   elements to be summed; only indices below n are read
//   shared  local scratch space, addressed through the *_LOCAL_I4 macros
//   n       exclusive upper bound on the input element indices that are read
//
// The outer loop runs OPERATIONS passes. A pass with a non-zero offset shifts
// every work-item's slot by that offset; the value written to `output` is read
// from slot 0 after the last pass, where the offset is 0.
//
// NOTE(review): the halving steps are selected by GROUP_SIZE at compile time,
// so the kernel presumably has to be enqueued with a local size equal to
// GROUP_SIZE -- confirm against the host code.
// NOTE(review): the largest int index touched in `shared` is
// (GROUP_SIZE - 1) + (OPERATIONS - 1) + GROUP_SIZE_3; confirm the host
// allocates at least that much when OPERATIONS > 1.
__kernel void reduce(__global int4 *output, __global const int4 *input, __local int4 *shared, const unsigned int n)
{
    const unsigned int group_id = get_global_id(0) / get_local_size(0);
    const unsigned int group_size = GROUP_SIZE;
    const unsigned int group_stride = 2 * group_size;
    const size_t local_stride = (size_t)group_stride * group_size;
    const unsigned int last = OPERATIONS - 1;

    for (unsigned int op = 0; op < OPERATIONS; op++)
    {
        const unsigned int offset = (last - op);
        const size_t local_id = get_local_id(0) + offset;

        // Accumulate privately and publish once; the stored value is the same
        // as zeroing the slot and updating it in local memory every iteration.
        int4 sum = (int4)(0);

        // Widen before multiplying so the start index cannot wrap in 32 bits.
        size_t i = (size_t)group_id * group_stride + local_id;
        while (i < n)
        {
            sum += LOAD_GLOBAL_I4(input, i);

            // The partner element exists only if it is still below n; without
            // this check the tail of the input is read out of bounds whenever
            // n is not a multiple of the pair stride.
            if (i + group_size < n)
            {
                sum += LOAD_GLOBAL_I4(input, i + group_size);
            }
            i += local_stride;
        }
        STORE_LOCAL_I4(shared, local_id, sum);

        // Successive halving. Each step is preceded by a barrier so that the
        // slots it reads have been written by the previous step.
        barrier(CLK_LOCAL_MEM_FENCE);
        #if (GROUP_SIZE >= 512)
            if (local_id < 256) { ACCUM_LOCAL_I4(shared, local_id, local_id + 256); }
        #endif

        barrier(CLK_LOCAL_MEM_FENCE);
        #if (GROUP_SIZE >= 256)
            if (local_id < 128) { ACCUM_LOCAL_I4(shared, local_id, local_id + 128); }
        #endif

        barrier(CLK_LOCAL_MEM_FENCE);
        #if (GROUP_SIZE >= 128)
            if (local_id <  64) { ACCUM_LOCAL_I4(shared, local_id, local_id +  64); }
        #endif

        barrier(CLK_LOCAL_MEM_FENCE);
        #if (GROUP_SIZE >= 64)
            if (local_id <  32) { ACCUM_LOCAL_I4(shared, local_id, local_id +  32); }
        #endif

        barrier(CLK_LOCAL_MEM_FENCE);
        #if (GROUP_SIZE >= 32)
            if (local_id <  16) { ACCUM_LOCAL_I4(shared, local_id, local_id +  16); }
        #endif

        barrier(CLK_LOCAL_MEM_FENCE);
        #if (GROUP_SIZE >= 16)
            if (local_id <   8) { ACCUM_LOCAL_I4(shared, local_id, local_id +   8); }
        #endif

        barrier(CLK_LOCAL_MEM_FENCE);
        #if (GROUP_SIZE >= 8)
            if (local_id <   4) { ACCUM_LOCAL_I4(shared, local_id, local_id +   4); }
        #endif

        barrier(CLK_LOCAL_MEM_FENCE);
        #if (GROUP_SIZE >= 4)
            if (local_id <   2) { ACCUM_LOCAL_I4(shared, local_id, local_id +   2); }
        #endif

        barrier(CLK_LOCAL_MEM_FENCE);
        #if (GROUP_SIZE >= 2)
            if (local_id <   1) { ACCUM_LOCAL_I4(shared, local_id, local_id +   1); }
        #endif
    }

    // Make the final slot-0 value visible before it is written out.
    barrier(CLK_LOCAL_MEM_FENCE);
    if (get_local_id(0) == 0)
    {
        int4 v = LOAD_LOCAL_I4(shared, 0);
        STORE_GLOBAL_I4(output, group_id, v);
    }
}