"#define THREAD_W 1 \n"
"#define THREAD_H 32 \n"
" \n"
"#define MAX_ISEC_COUNT 24 \n"
" \n"
"/* Texture to output resultats */ \n"
"//texture<float, 1, cudaReadModeElementType> out_tex; \n"
" \n"
"/**	\\struct Vec3. \n"
"\\brief Class representing 3 dimensional vector. CUDA version. \n"
"*/ \n"
" \n"
"/**	\\struct ct_state. \n"
"	\\brief Structure describe current CT state. \n"
"*/ \n"
"struct ct_state{ \n"
"	float4 o;///< rays origin \n"
"	float4 c;///< corner of plate \n"
"	float4 dx;///< dx of plane in 3d \n"
"	float4 dy;///< dy of plane in 3d \n"
"}; \n"
" \n"
"//**	The axis-aligned bounding box \n"
"struct aabb { \n"
"	float x1,x2,y1,y2,z1,z2; \n"
"}; \n"
" \n"
"int isInside(struct aabb * box, float4 * v) \n"
"{ \n"
"	if((*v).x<=box->x2) \n"
"		if((*v).x>=box->x1)  \n"
"			if((*v).y<=box->y2) \n"
"				if((*v).y>=box->y1)  \n"
"					if((*v).z<=box->z2) \n"
"						if((*v).z>=box->z1) \n"
"							return 1; \n"
"	return 0; \n"
"} \n"
" \n"
"/**	\\struct BSPNode. \n"
"\\brief Class representing a BSP-tree node. \n"
" \n"
"BSP-tree node.	 \n"
"*/ \n"
" \n"
" \n"
"struct BSPNode \n"
"{ \n"
"	union { \n"
"		unsigned int tri_start; \n"
"		unsigned int offset; \n"
"	}; \n"
"	union { \n"
"		unsigned int tri_count; \n"
"		float splitCoord; \n"
"	}; \n"
"	unsigned int masked_vars; \n"
"}; \n"
" \n"
"#define isLeaf(node)	(bool)( (node->masked_vars) & 0x00000080 ) \n"
"#define has_left(node)	(bool)( (node->masked_vars) & 0x00000040 ) \n"
"#define has_right(node)	(bool)( (node->masked_vars) & 0x00000020 ) \n"
"#define axisInd(node)	(int) ( (node->masked_vars) & 0x00000003 ) \n"
" \n"
" \n"
"/**	\\struct traverse_stack. \n"
"	\\brief Used for tree traversal. \n"
" \n"
"	CUDA version.	 \n"
"*/ \n"
"struct trace_t  \n"
"{ \n"
"	//trace_t(unsigned int aa, float a, float b):node(aa),tmin(a),tmax(b){} \n"
"	unsigned int node; \n"
"	float tmin; \n"
"	float tmax; \n"
"}; \n"
" \n"
"struct traverse_stack \n"
"{ \n"
"	int index; \n"
"	struct trace_t t[30]; \n"
"}; \n"
" \n"
"#define STACK_GET(stack) ( &( stack.t[ stack.index - 1 ] ) ) \n"
"#define STACK_PUSH(stack, a_node, a_tmin, a_tmax)	{ stack.t[ stack.index ].node = a_node;	stack.t[ stack.index ].tmin = a_tmin; stack.t[ stack.index ].tmax = a_tmax;	stack.index++; } \n"
"#define STACK_POP(stack) stack.index--; \n"
" \n"
"/**	\\struct wald_tri. \n"
"\\brief Structure representing a triangle data needed for intersection test. \n"
" \n"
"Wald triangle structure.	 \n"
"*/ \n"
" \n"
"struct wald_tri \n"
"{ \n"
"	float N[3]; \n"
"	float A[3]; \n"
"	float nu, nv, nd; \n"
"	unsigned int k; \n"
"	float bnu, bnv, cnu, cnv; \n"
"}; \n"
" \n"
"#define wt_N			(float4)(wt->N[0], wt->N[1], wt->N[2], 0.0f) \n"
"#define wt_A			(float4)(wt->A[0], wt->A[1], wt->A[2], 0.0f) \n"
" \n"
"/**	\\class Ray. \n"
"\\brief Class representing ray in 3D. \n"
" \n"
"OpenCL version.	 \n"
"*/ \n"
"struct Ray \n"
"{ \n"
"	float4 o;	///< ray origin's position \n"
"	float4 dir;	///< ray direction vector \n"
"}; \n"
" \n"
"/**	\\struct intersection. \n"
"\\brief Structure representing intersection data. \n"
" \n"
"Contains data about primitive. OpenCL version	 \n"
"*/ \n"
"struct intersection \n"
"{ \n"
"	unsigned int prim_ind; \n"
"	float dist; \n"
"}; \n"
" \n"
" \n"
"/** \n"
"* Ray-AABB intersection routine. CUDA version. \n"
"* @param ray ray class. \n"
"* @param box axis aligned bounding box structure. \n"
"* @return  \n"
"1 - if ray intersects AABB \n"
"0 - otherwise \n"
"*/ \n"
"static inline unsigned int IntersectRayAABB(struct Ray * ray, __global struct aabb * box, float tmin, float tmax) \n"
"{ \n"
"	float l1 = (box->x1 - ray->o.x) / ray->dir.x; \n"
"	float l2 = (box->x2 - ray->o.x) / ray->dir.x; \n"
"	tmin = fmax(fmin(l1,l2), tmin); \n"
"	tmax = fmin(fmax(l1,l2), tmax); \n"
"	l1 = (box->y1 - ray->o.y) / ray->dir.y; \n"
"	l2 = (box->y2 - ray->o.y) / ray->dir.y; \n"
"	tmin = fmax(fmin(l1,l2), tmin); \n"
"	tmax = fmin(fmax(l1,l2), tmax); \n"
"	l1 = (box->z1 - ray->o.z) / ray->dir.z; \n"
"	l2 = (box->z2 - ray->o.z) / ray->dir.z; \n"
"	tmin = fmax(fmin(l1,l2), tmin); \n"
"	tmax = fmin(fmax(l1,l2), tmax); \n"
"	return ((tmax >= tmin) & (tmax >= 0.f)); \n"
"} \n"
"/** \n"
"* Ray-AABB intersection routine. \n"
"* checks which subnodes' AABBs are intersected by ray. CUDA version. \n"
"* @param ray ray class. \n"
"* @param tmin parent AABB min t. \n"
"* @param tmin parent AABB max t. \n"
"* @param split split plane's coordinates \n"
"* @param splitIndex index of splitting axis \n"
"* @param t [out] split plane's t \n"
"* @return  \n"
"0 - left node intersected \n"
"1 - both nodes intersected \n"
"2 - right node intersected \n"
"*/ \n"
"inline int GetIntersectionState(struct Ray * ray,  \n"
"								const float tmin,  \n"
"								const float tmax,  \n"
"								const float split,  \n"
"								const int splitIndex,  \n"
"								float * t) \n"
"{	 \n"
"	float rd = ((float*)(&ray->dir))[splitIndex]; \n"
"	float ro = ((float*)(&ray->o))[splitIndex]; \n"
"	if(!rd) \n"
"		rd = 0.0000000001f; \n"
"	(*t) = (split - ro) / rd; \n"
"	const int sign = (rd >= 0.0f); \n"
"	if((*t) < tmin)  \n"
"		return (sign^0); \n"
"	if((*t) > tmax)  \n"
"		return (sign^1); \n"
"	return 2; \n"
"} \n"
" \n"
"//** Ray vs Wald Tri intersection routine \n"
"// Intersection method return values \n"
"#define HIT		 1		// Ray hit primitive \n"
"#define MISS	 0		// Ray missed primitive \n"
"#define INPRIM	-1		// Ray started inside primitive \n"
" \n"
"#define ku modulo[wt->k + 1] \n"
"#define kv modulo[wt->k + 2] \n"
"static __constant unsigned int modulo[] = { 0, 1, 2, 0, 1 }; \n"
" \n"
"inline int IntersectRayWTri( struct Ray * a_Ray, __global struct wald_tri * wt, float * a_Dist, float * a_Dip ) \n"
"{ \n"
"	float * prt_dir = &a_Ray->dir; \n"
"	float * prt_o = &a_Ray->o; \n"
"	const float dir_wt_k = prt_dir[wt->k]; \n"
"	const float dir_ku = prt_dir[ku]; \n"
"	const float dir_kv = prt_dir[kv]; \n"
"	const float o_wt_k = prt_o[wt->k]; \n"
"	const float o_ku = prt_o[ku]; \n"
"	const float o_kv = prt_o[kv]; \n"
"	float4 A = wt_A; \n"
"	float * ptr_A = &A; \n"
"	const float wt_A_ku = ptr_A[ku]; \n"
"	const float wt_A_kv = ptr_A[kv]; \n"
" \n"
"	const float lnd = 1.0f / (dir_wt_k + wt->nu * dir_ku + wt->nv * dir_kv); \n"
"	const float t = (wt->nd - o_wt_k - wt->nu * o_ku - wt->nv * o_kv) * lnd; \n"
"	if (!((*a_Dist) > t && t > 0)) return MISS; \n"
"	const float hu = o_ku + t*dir_ku - wt_A_ku; \n"
"	const float hv = o_kv + t*dir_kv - wt_A_kv; \n"
"	const float beta = hv * wt->bnu + hu * wt->bnv; \n"
"	if (beta < 0) return MISS; \n"
"	const float gamma = hu * wt->cnu + hv * wt->cnv; \n"
"	if (gamma < 0) return MISS; \n"
"	if ((beta + gamma) > 1) return MISS; \n"
"	(*a_Dist) = t; \n"
"	(*a_Dip) = dot(a_Ray->dir, wt_N); \n"
"	return ( (*a_Dip) > 0 ) ? INPRIM : HIT; \n"
"} \n"
" \n"
" \n"
"//** Trace the ray inside the tree \n"
"int trace_tree(	struct Ray * ray,  \n"
"				float * res,  \n"
"				float * res_dip, \n"
"				 \n"
"				__global struct aabb * cl_aabb, \n"
"				__global struct aabb * cut_aabbs, \n"
"				unsigned int cut_aabbs_count, \n"
"				 \n"
"				__global struct BSPNode * nodes, \n"
"				__global struct wald_tri * tris, \n"
"				__global unsigned int * ids) \n"
"{ \n"
"	float cur_tmin = 0; \n"
"	float cur_tmax = 100000.f; \n"
"	//check main AABB \n"
"	bool intersects = IntersectRayAABB(ray, cl_aabb, cur_tmin, cur_tmax); \n"
"	if(!intersects) \n"
"		return 0; \n"
"	//check region-of-interest AABBs \n"
"	if(intersects && cut_aabbs_count != 0) \n"
"	{ \n"
"		intersects = false; \n"
"		for (unsigned int i=0; i<cut_aabbs_count; i++) \n"
"		{ \n"
"			float a=0.f, b=100000.f; \n"
"			if(IntersectRayAABB(ray, &cut_aabbs[i], a, b))  \n"
"			{ \n"
"				intersects = true; \n"
"				break; \n"
"			} \n"
"		} \n"
"	} \n"
"	if(!intersects) \n"
"		return 0; \n"
" \n"
"	struct intersection intersections[MAX_ISEC_COUNT];//intersection intersections[20]; \n"
"	intersections[0].dist = 0;//intersections[0].dist = res; \n"
" \n"
"	struct traverse_stack tr_stack; \n"
"	tr_stack.index = 0; \n"
"	//const unsigned int resint = Intersect(ray, cl_aabb, cur_tmin, cur_tmax); \n"
"	unsigned int cur_node_id = 0; \n"
"	STACK_PUSH( tr_stack, cur_node_id, cur_tmin, cur_tmax); \n"
" \n"
"	__global struct BSPNode * cur_node; \n"
"	__global struct wald_tri * cur_tri; \n"
"	unsigned int sign = 0; \n"
"	unsigned int isec_count = 0; \n"
" \n"
"	while (tr_stack.index > 0) \n"
"	{ \n"
"		cur_node_id		= STACK_GET(tr_stack)->node; \n"
"		cur_tmin		= STACK_GET(tr_stack)->tmin; \n"
"		cur_tmax		= STACK_GET(tr_stack)->tmax; \n"
"		STACK_POP(tr_stack); \n"
"		cur_node = &nodes[cur_node_id]; \n"
"		if(isLeaf(cur_node)) \n"
"		{ \n"
"			for (unsigned int i=0; i<cur_node->tri_count; i++) \n"
"			{ \n"
"				float a_Dist = 1000000.0f, a_Dip; \n"
"				unsigned int indx = ids[i + cur_node->tri_start]; \n"
"				cur_tri = &tris[indx];//read the triangle \n"
" \n"
"				if (IntersectRayWTri( ray, cur_tri, &a_Dist, &a_Dip ))  \n"
"				{ \n"
"					bool again = false; \n"
"					for (unsigned int i=0; i<isec_count; i++) \n"
"						if(intersections[i].prim_ind == indx) \n"
"							again = true; \n"
"					if(again) \n"
"						continue; \n"
"					 \n"
"					(*res_dip) += fabs(a_Dip); \n"
"					intersections[isec_count].prim_ind = indx; \n"
"					intersections[isec_count].dist = a_Dist; \n"
"					isec_count++; \n"
"					if(isec_count == MAX_ISEC_COUNT) \n"
"						goto MAX_INTERSECTIONS_HIT; \n"
"				} \n"
"			} \n"
"		} \n"
"		else  \n"
"		{ \n"
"			float t; \n"
"			const int resisec = GetIntersectionState(ray,  \n"
"				cur_tmin, cur_tmax, \n"
"				cur_node->splitCoord, axisInd(cur_node), &t); \n"
"			switch(resisec) \n"
"			{ \n"
"			case 0://ray intersects left only \n"
"				if(has_left(cur_node)) \n"
"					STACK_PUSH(tr_stack, cur_node->offset, cur_tmin, cur_tmax); \n"
"				break; \n"
"			case 1://ray intersects right only \n"
"				if(has_right(cur_node)) \n"
"					STACK_PUSH(tr_stack, cur_node->offset+1, cur_tmin, cur_tmax); \n"
"				break; \n"
"			case 2://ray intersects left and right \n"
"				sign = ((float*)&ray->dir)[axisInd(cur_node)] >= 0.0f; \n"
"				if(sign) \n"
"				{ \n"
"					if(has_left(cur_node)) \n"
"						STACK_PUSH(tr_stack, cur_node->offset, cur_tmin, t); \n"
"					if(has_right(cur_node)) \n"
"						STACK_PUSH(tr_stack, cur_node->offset+1, t, cur_tmax); \n"
"				} \n"
"				else \n"
"				{ \n"
"					if(has_right(cur_node)) \n"
"						STACK_PUSH(tr_stack, cur_node->offset+1, cur_tmin, t); \n"
"					if(has_left(cur_node)) \n"
"						STACK_PUSH(tr_stack, cur_node->offset, t, cur_tmax);	 \n"
"				} \n"
"				break; \n"
"			} \n"
"		} \n"
"	} \n"
" \n"
"MAX_INTERSECTIONS_HIT: \n"
" \n"
"	//sort intersections in one node \n"
"	for(unsigned int i = 0; i < isec_count; i++) \n"
"	{ \n"
"		float tmp; \n"
"		unsigned char swapped=0; \n"
"		for(unsigned int j = 0; j < isec_count-1-i; j++) \n"
"		{ \n"
"			if( intersections[j].dist > intersections[j+1].dist ) \n"
"			{ \n"
"				tmp = intersections[j].dist; \n"
"				intersections[j].dist = intersections[j+1].dist; \n"
"				intersections[j+1].dist = tmp; \n"
"				if(swapped==0) swapped=1; \n"
"			} \n"
"		} \n"
"		if(swapped == 0) break; \n"
"	} \n"
"	unsigned int n=0; \n"
"	if(isec_count % 2 == 0) //TODO: temporary workaround \n"
"		for (unsigned int i=0; i<isec_count; i++) \n"
"			if((i+n)%2==1) \n"
"				(*res) += intersections[i].dist - intersections[i-1].dist; \n"
"	if(isec_count)  \n"
"		(*res_dip) /= isec_count; \n"
" \n"
"	return 1; \n"
"} \n"
" \n"
"#define float4_at(data, index) (float4)(data[index], data[index+1], data[index+2], 0.0f) \n"
"//** Perform the batch ray casting \n"
"__kernel void raycast_batch( \n"
"					int w, int h, \n"
"					__global float * a_out_data,  \n"
"					__global float * out_dip, \n"
"					 \n"
"					__global float * os, \n"
"					__global float * cs, \n"
"					__global float * dxs, \n"
"					__global float * dys, \n"
"					__global struct aabb * cl_aabb, \n"
"					__global struct aabb * cut_aabbs, \n"
"					unsigned int cut_aabbs_count, \n"
"					 \n"
"					__global struct BSPNode * nodes, \n"
"					__global struct wald_tri * tris, \n"
"					__global unsigned int * ids) \n"
"{ \n"
"	const int vIdx = get_global_id(0); \n"
"	const int batch = vIdx / (h*w); \n"
"	const int y = ( vIdx - (batch*w*h) ) / h; \n"
"	const int x = vIdx % w; \n"
"	int pos = 3 * batch; \n"
"	float4 dir =	float4_at(cs, pos)  \n"
"				+ x*float4_at(dxs, pos)  \n"
"				+ y*float4_at(dys, pos) \n"
"				-	float4_at(os, pos); \n"
"	dir = normalize( dir ); \n"
"	struct Ray r; \n"
"	r.o = float4_at(os, pos); \n"
"	r.dir = dir;//( o, dir ); \n"
"	float res = 0.0f, res_dip = 0.0f; \n"
" \n"
"	trace_tree(	&r, &res, &res_dip, \n"
"				cl_aabb, \n"
"				cut_aabbs, cut_aabbs_count, \n"
"				nodes, tris, ids); \n"
" \n"
" 	a_out_data	[vIdx] = res; \n"
" 	out_dip		[vIdx] = res_dip; \n"
"} \n"
