#include <stdio.h>


// Prints each launched thread's coordinates (thread index within its
// block, and block index). Device-side printf is for debugging only.
__global__ void hello() {
	const int tid = threadIdx.x;
	const int bid = blockIdx.x;
	printf("Hello from %d in block %d\n", tid, bid);
}


// Increments every element of arr[0..n) by 1.
//
// Uses a grid-stride loop so the kernel is correct for ANY launch
// configuration: the original fixed-index version silently skipped the
// tail whenever n exceeded gridDim.x * blockDim.x. For launches that
// already covered all n elements the results are identical.
//
// arr: device pointer to at least n ints (must not be a host pointer).
// n:   element count; n <= 0 is a no-op.
__global__ void addOne(int* arr, int n) {
	int stride = gridDim.x * blockDim.x;
	for (int id = blockIdx.x * blockDim.x + threadIdx.x; id < n; id += stride) {
		arr[id] += 1;
	}
}


// Element-wise vector addition: c[i] = a[i] + b[i] for i in [0, sz).
//
// Uses a grid-stride loop so correctness no longer depends on the
// launch covering all sz elements (the original fixed-index version
// left the tail of c untouched when sz > gridDim.x * blockDim.x).
//
// a, b: device pointers to the input vectors (sz ints each).
// c:    device pointer to the output vector (may not alias a or b is
//       not required — each index is written exactly once).
// sz:   element count; sz <= 0 is a no-op.
__global__ void addArr(int* a, int* b, int* c, int sz){
	int stride = gridDim.x * blockDim.x;
	for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < sz; idx += stride) {
		c[idx] = a[idx] + b[idx];
	}
}


// Launches the hello kernel on 2 blocks of 4 threads and waits for it.
//
// Returns 0 on success, 1 on failure. The original always returned 0
// and checked neither error source: launch-configuration errors only
// surface via cudaGetLastError(), and in-kernel faults only surface at
// the next synchronizing call.
int test1(){
	hello<<<2, 4>>>();
	cudaError_t err = cudaGetLastError();   // catches a bad launch config
	if (err == cudaSuccess) {
		err = cudaDeviceSynchronize();      // catches async execution errors
	}
	if (err != cudaSuccess) {
		fprintf(stderr, "test1 failed: %s\n", cudaGetErrorString(err));
		return 1;
	}
	return 0;
}


// Copies an 8-element array to the device, increments every element
// with addOne, copies the result back, and prints it.
//
// Fixes over the original: CUDA API and launch errors are checked, the
// launch configuration is computed by ceil-division instead of being
// hard-coded to exactly n threads, and the printed values are separated
// (the original "%d" with no delimiter fused all digits together).
void test2(){

	const int n = 8;
	int arr_host[] = { 1, 2, 3, 4, 5, 6, 7, 8}; // in the CPU or host
	int *arr_gpu = NULL; // GPU or device
	if (cudaMalloc(&arr_gpu, n * sizeof(int)) != cudaSuccess) {
		fprintf(stderr, "test2: cudaMalloc failed\n");
		return;
	}

	cudaMemcpy(arr_gpu, arr_host, n * sizeof(int), cudaMemcpyHostToDevice);

	// Ceil-div: correct even if n were not a multiple of the block size.
	const int threads = 8;
	const int blocks = (n + threads - 1) / threads;
	addOne<<<blocks, threads>>>(arr_gpu, n);
	cudaError_t err = cudaGetLastError(); // launch-config errors surface here
	if (err != cudaSuccess) {
		fprintf(stderr, "test2: launch failed: %s\n", cudaGetErrorString(err));
		cudaFree(arr_gpu);
		return;
	}

	// A blocking D2H memcpy also synchronizes with the kernel above.
	cudaMemcpy(arr_host, arr_gpu, n * sizeof(int), cudaMemcpyDeviceToHost);
	cudaFree(arr_gpu);

	for (int i = 0; i < n; i++) {
		printf("%d ", arr_host[i]);
	}
	printf("\n");
}


// Adds two 1024-element vectors on the device with addArr and prints
// one element of the result as a spot check.
//
// Fixes over the original: all six allocations are released (the
// original leaked three new[] host buffers and three cudaMalloc device
// buffers), the launch configuration is derived by ceil-division
// instead of hard-coded 4x256, and launch errors are reported.
void test3(){

	const int n = 1024;
	int *h_a = new int[n], *h_b = new int[n], *h_c = new int[n];
	for(int i = 0; i < n; i++){
		h_a[i] = i;
		h_b[i] = i*2;
	}

	int *d_a = NULL, *d_b = NULL, *d_c = NULL;
	cudaMalloc(&d_a, n*sizeof(int));
	cudaMalloc(&d_b, n*sizeof(int));
	cudaMalloc(&d_c, n*sizeof(int));

	cudaMemcpy(d_a, h_a, n*sizeof(int), cudaMemcpyHostToDevice);
	cudaMemcpy(d_b, h_b, n*sizeof(int), cudaMemcpyHostToDevice);

	const int threads = 256;
	const int blocks = (n + threads - 1) / threads; // ceil-div launch config
	addArr<<<blocks, threads>>>(d_a, d_b, d_c, n);
	cudaError_t err = cudaGetLastError(); // launch-config errors surface here
	if (err != cudaSuccess) {
		fprintf(stderr, "test3: launch failed: %s\n", cudaGetErrorString(err));
	} else {
		// Blocking D2H copy synchronizes with the kernel before we read h_c.
		cudaMemcpy(h_c, d_c, n*sizeof(int), cudaMemcpyDeviceToHost);
		printf("c[5] = %d\n", h_c[5]);
	}

	// Release everything the original leaked.
	cudaFree(d_a);
	cudaFree(d_b);
	cudaFree(d_c);
	delete[] h_a;
	delete[] h_b;
	delete[] h_c;
}


// Entry point: runs only the vector-addition demo (test3).
int main() {
	test3();
	return 0;
}
