/* NOTE(review): N is not referenced by any kernel visible in this file;
 * presumably a leftover or used by code outside this view - confirm before removing. */
#define N 100


/*
 * addVector - element-wise combination of two float vectors.
 *
 * For its global id i (dimension 0) each work-item stores
 *
 *   out[i] = (in1[i] + in2[i]*a) + (in1[i] - in2[i]*a)
 *          +  in1[i] * in2[i] * a
 *          +  in1[i] / in2[i] * a
 *
 * in1, in2  input vectors, only read
 * out       output vector, written once per work-item
 * a         scalar factor applied to in2, to the product and to the quotient
 *
 * There is no length argument and no bounds check: every id of the launched
 * range is used directly as an index into all three buffers.  No guard is
 * applied against in2[i] == 0 in the division.
 *
 * Both operands are loaded once and the result is built in a private
 * variable, so global memory sees two loads and a single store.  The inputs
 * are therefore sampled before out[i] is written.
 */
__kernel
void addVector(
	__global const float *in1,
	__global const float *in2,
	__global       float *out,
			 const float a
)
{
	/* get_global_id() returns size_t; keep the full width for indexing. */
	const size_t index = get_global_id(0);

	const float lhs    = in1[index];
	const float rhs    = in2[index];
	const float scaled = rhs * a;

	/* Terms are added in the order listed above to keep rounding stable. */
	float acc = lhs + scaled;
	acc += lhs - scaled;
	acc += lhs * rhs * a;
	acc += lhs / rhs * a;

	out[index] = acc;
}

/*
 * addVector2 - write each work-item's flattened 2-D index into out.
 *
 * The (x, y) global id is linearised as x + y * get_global_size(0); the
 * resulting value is stored in out at that same position.
 *
 * out  destination buffer of unsigned int; no bounds check is done here.
 */
__kernel
void addVector2(
	__global       unsigned int *out
)
{
	const unsigned int col       = get_global_id(0);
	const unsigned int row       = get_global_id(1);
	const unsigned int row_pitch = get_global_size(0);

	/* Row-major position of (col, row) in a grid row_pitch items wide. */
	const unsigned int linear    = row * row_pitch + col;

	out[linear] = linear;
}

/*
 * getPI - one term per work-item of a midpoint-rule estimate of pi.
 *
 * [0, 1] is divided into get_global_size(0) strips of width `step`.
 * Work-item i evaluates 4 / (1 + x*x) at the strip midpoint
 * x = (i + 0.5) * step and stores that value multiplied by `step` in out[i].
 * Adding up out[0 .. size) afterwards yields the estimate; this kernel does
 * not perform that summation itself.
 *
 * out  destination buffer, one float per work-item of dimension 0
 */
__kernel
void getPI(
	__global      float *out
)
{
	const size_t size  = get_global_size(0);
	const size_t index = get_global_id(0);

	/*
	 * All literals carry the `f` suffix so the arithmetic stays in single
	 * precision; unsuffixed literals are double, which is an optional
	 * feature on OpenCL devices.
	 */
	const float step = 1.0f / (float)size;
	const float x    = ((float)index + 0.5f) * step;

	out[index] = (4.0f / (1.0f + x * x)) * step;
}


/*
 * totalPI - add up out[0 .. get_global_size(0)) and store the sum in out[0].
 *
 * Only the work-item with global id 0 does any work: it walks the buffer
 * serially and overwrites out[0] with the total.  All other elements are
 * left unchanged.
 *
 * The buffer is intentionally not cleared here, because it holds the very
 * terms being summed, and no other work-item writes to it, so the serial
 * read cannot race with a store from this kernel.
 * NOTE(review): presumably `out` was filled by an earlier enqueue (e.g.
 * getPI) with the same global size - confirm against the host code.
 *
 * out  in: terms to add; out: out[0] holds their sum
 */
__kernel
void totalPI(
	__global      float *out
)
{
	/* Every work-item except the first has nothing to do. */
	if (get_global_id(0) != 0) {
		return;
	}

	const size_t size = get_global_size(0);

	/* The accumulator must start at zero; automatic variables are not
	 * initialised implicitly. */
	float total = 0.0f;
	for (size_t i = 0; i < size; ++i) {
		total += out[i];
	}

	out[0] = total;
}



/*
 * pi - per-work-group partial sums of 4 / (1 + x*x) sampled at strip
 * midpoints, for a midpoint-rule estimate of pi.
 *
 * Each work-item accumulates `niters` consecutive terms, starting at term
 * (group_id * local_size + local_id) * niters, and publishes its subtotal in
 * local_sums[local_id].  After a work-group barrier, local work-item 0 adds
 * the subtotals serially and writes the result to partial_sums[group_id].
 *
 * niters        number of terms handled by each work-item
 * step_size     strip width; term i is sampled at x = (i + 0.5) * step_size
 * local_sums    __local scratch, indexed by local id (one float per
 *               work-item of the group is written and read)
 * partial_sums  output, one float per work-group
 *
 * The terms are not multiplied by step_size in this kernel.
 * NOTE(review): presumably the host adds up partial_sums and applies the
 * step_size factor - confirm against the caller.
 */
__kernel void pi(
    const    int        niters,
    const    float      step_size,
    __local  float*     local_sums,
    __global float*     partial_sums)
{
    const int num_wrk_items = (int)get_local_size(0);
    const int local_id      = (int)get_local_id(0);
    const int group_id      = (int)get_group_id(0);

    /* Half-open range [istart, iend) of term indices owned by this item. */
    const int istart = (group_id * num_wrk_items + local_id) * niters;
    const int iend   = istart + niters;

    /*
     * `f`-suffixed literals keep the loop in single precision; unsuffixed
     * literals are double, an optional feature on OpenCL devices.
     */
    float accum = 0.0f;
    for (int i = istart; i < iend; i++) {
        const float x = ((float)i + 0.5f) * step_size;
        accum += 4.0f / (1.0f + x * x);
    }

    local_sums[local_id] = accum;

    /* Make every subtotal visible before the first work-item reads them. */
    barrier(CLK_LOCAL_MEM_FENCE);

    if (local_id == 0) {
        float sum = 0.0f;
        for (int i = 0; i < num_wrk_items; i++) {
            sum += local_sums[i];
        }
        partial_sums[group_id] = sum;
    }
}
/*
 * pi_vec4 - same computation as a scalar midpoint-rule partial sum of
 * 4 / (1 + x*x), evaluated four terms at a time with float4.
 *
 * Each work-item accumulates `niters` consecutive terms, starting at term
 * (group_id * local_size + local_id) * niters, and publishes its subtotal in
 * local_sums[local_id].  After a work-group barrier, local work-item 0 adds
 * the subtotals serially and writes the result to partial_sums[group_id].
 *
 * niters        number of terms handled by each work-item; any value is
 *               accepted, a remainder of 1-3 terms is evaluated scalar-wise
 *               so the work-item never reads past its own range
 * step_size     strip width; term i is sampled at x = (i + 0.5) * step_size
 * local_sums    __local scratch, indexed by local id
 * partial_sums  output, one float per work-group
 *
 * The terms are not multiplied by step_size in this kernel.
 * NOTE(review): presumably the host applies that factor - confirm.
 */
__kernel void pi_vec4(
    const int          niters,
    const float        step_size,
    __local  float*    local_sums,
    __global float*    partial_sums)
{
    const int num_wrk_items = (int)get_local_size(0);
    const int local_id      = (int)get_local_id(0);
    const int group_id      = (int)get_group_id(0);

    /* Midpoint offsets of four consecutive strips. */
    const float4 ramp = {0.5f, 1.5f, 2.5f, 3.5f};

    /* Half-open range [istart, iend) of term indices owned by this item. */
    const int istart = (group_id * num_wrk_items + local_id) * niters;
    const int iend   = istart + niters;

    float accum = 0.0f;
    int i = istart;

    /* Complete groups of four terms. */
    for (; iend - i >= 4; i += 4) {
        const float4 x        = ((float4)i + ramp) * step_size;
        const float4 psum_vec = 4.0f / (1.0f + x * x);
        accum += psum_vec.s0 + psum_vec.s1 + psum_vec.s2 + psum_vec.s3;
    }

    /* Left-over terms when niters is not a multiple of four. */
    for (; i < iend; ++i) {
        const float x = ((float)i + 0.5f) * step_size;
        accum += 4.0f / (1.0f + x * x);
    }

    local_sums[local_id] = accum;

    /* Make every subtotal visible before the first work-item reads them. */
    barrier(CLK_LOCAL_MEM_FENCE);

    if (local_id == 0) {
        float sum = 0.0f;
        for (int k = 0; k < num_wrk_items; k++) {
            sum += local_sums[k];
        }
        partial_sums[group_id] = sum;
    }
}

/*
 * pi_vec8 - midpoint-rule partial sum of 4 / (1 + x*x), evaluated eight
 * terms at a time with float8.
 *
 * Each work-item accumulates `niters` consecutive terms, starting at term
 * (group_id * local_size + local_id) * niters, and publishes its subtotal in
 * local_sums[local_id].  After a work-group barrier, local work-item 0 adds
 * the subtotals serially and writes the result to partial_sums[group_id].
 *
 * niters        number of terms handled by each work-item; any value is
 *               accepted, a remainder of 1-7 terms is evaluated scalar-wise
 *               so the work-item never reads past its own range
 * step_size     strip width; term i is sampled at x = (i + 0.5) * step_size
 * local_sums    __local scratch, indexed by local id
 * partial_sums  output, one float per work-group
 *
 * The terms are not multiplied by step_size in this kernel.
 * NOTE(review): presumably the host applies that factor - confirm.
 */
__kernel
void pi_vec8(
             const int      niters,
             const float    step_size,
    __local        float*   local_sums,
    __global       float*   partial_sums
)
{
    const int num_wrk_items = (int)get_local_size(0);
    const int local_id      = (int)get_local_id(0);
    const int group_id      = (int)get_group_id(0);

    /* Midpoint offsets of eight consecutive strips. */
    const float8 ramp = {0.5f, 1.5f, 2.5f, 3.5f, 4.5f, 5.5f, 6.5f, 7.5f};

    /* Half-open range [istart, iend) of term indices owned by this item. */
    const int istart = (group_id * num_wrk_items + local_id) * niters;
    const int iend   = istart + niters;

    float accum = 0.0f;
    int i = istart;

    /* Complete groups of eight terms. */
    for (; iend - i >= 8; i += 8) {
        const float8 x        = ((float8)i + ramp) * step_size;
        const float8 psum_vec = 4.0f / (1.0f + x * x);
        accum += psum_vec.s0 + psum_vec.s1 + psum_vec.s2 + psum_vec.s3 +
                 psum_vec.s4 + psum_vec.s5 + psum_vec.s6 + psum_vec.s7;
    }

    /* Left-over terms when niters is not a multiple of eight. */
    for (; i < iend; ++i) {
        const float x = ((float)i + 0.5f) * step_size;
        accum += 4.0f / (1.0f + x * x);
    }

    local_sums[local_id] = accum;

    /* Make every subtotal visible before the first work-item reads them. */
    barrier(CLK_LOCAL_MEM_FENCE);

    if (local_id == 0) {
        float sum = 0.0f;
        for (int k = 0; k < num_wrk_items; k++) {
            sum += local_sums[k];
        }
        partial_sums[group_id] = sum;
    }
}
