Improving parallelism in fragment shader leading to more than 20x the framerate

This commit is contained in:
2019-12-26 23:07:03 +02:00
parent 2e6ddd2ff2
commit 6cef4df543
6 changed files with 48 additions and 27 deletions
+28 -11
View File
@@ -27,7 +27,7 @@ const char* fragment_shader_kernel_source =
"} \n" "} \n"
" \n" " \n"
"__kernel void fragment_shader ( \n" "__kernel void fragment_shader ( \n"
" __global int* faces, \n" " __global int3* faces, \n"
" __global float* vertices, \n" " __global float* vertices, \n"
" __global int* pixels, \n" " __global int* pixels, \n"
" __global int* screen_width, \n" " __global int* screen_width, \n"
@@ -39,8 +39,9 @@ const char* fragment_shader_kernel_source =
" __global float* norms_buff, \n" " __global float* norms_buff, \n"
" __global uchar* diffuse_map \n" " __global uchar* diffuse_map \n"
") { \n" ") { \n"
" int index = get_global_id(0); \n" " int GROUP_ID = get_group_id(0); \n"
" \n" " int GROUP_SIZE = get_local_size(0); \n"
" int LOCAL_ID = get_local_id(0); \n"
" \n" " \n"
" bool out = true; \n" " bool out = true; \n"
" float3 vertices3[3]; \n" " float3 vertices3[3]; \n"
@@ -48,18 +49,18 @@ const char* fragment_shader_kernel_source =
" float3 norms[3]; \n" " float3 norms[3]; \n"
" \n" " \n"
" for(int i = 0; i < 3; i++) { \n" " for(int i = 0; i < 3; i++) { \n"
" float4 vertex = vertices [faces[(index * 9) + (3 * i) + (0)]]; \n" " float4 vertex;// = vertices[faces[GROUP_ID * 3 + i ][0]]; \n"
" for(int j = 0; j < 4; j ++) { \n" " for(int j = 0; j < 4; j ++) { \n"
" vertex[j] = vertices[4 * faces[(index*9) + (3 * i) + (0)] + j]; \n" " vertex[j] = vertices[4 * faces[GROUP_ID*3 + i][0] + j]; \n"
" } \n" " } \n"
" \n" " \n"
" for(int j = 0; j < 2; j++) { \n" " for(int j = 0; j < 2; j++) { \n"
" uv_coords[i][j] = uv_buffer [2 * faces[(index * 9) + (3 * i) + (1)] + j]; \n" " uv_coords[i][j] = uv_buffer [2 * faces[GROUP_ID * 3 + i][1] + j]; \n"
" } \n" " } \n"
" \n" " \n"
" for( int j = 0; j < 3; j++ ) { \n" " for( int j = 0; j < 3; j++ ) { \n"
" vertices3[j][i] = (vertex[j]/vertex[3]); \n" " vertices3[j][i] = (vertex[j]/vertex[3]); \n"
" norms[j][i] = norms_buff[3 * (faces[(index * 9) + (3 * i) + (2)]) + j]; \n" " norms[j][i] = norms_buff[3 * (faces[GROUP_ID * 3 + i][2]) + j]; \n"
" } \n" " } \n"
" \n" " \n"
" if ( vertices3[0][i] > 0 && vertices3[0][i] < *screen_width \n" " if ( vertices3[0][i] > 0 && vertices3[0][i] < *screen_width \n"
@@ -82,10 +83,21 @@ const char* fragment_shader_kernel_source =
" } \n" " } \n"
" } \n" " } \n"
" \n" " \n"
" float3 point; \n" " if(bounding_box_min[0] > *screen_width || bounding_box_max[0] < 0 || bounding_box_min[1] > *screen_width || bounding_box_max[1] < 0) return; \n"
" for(point[0] = bounding_box_min[0]; point[0] <= bounding_box_max[0]; point[0]++) { \n"
" for(point[1] = bounding_box_min[1]; point[1] <= bounding_box_max[1]; point[1]++) { \n"
" \n" " \n"
" \n"
" int X_PER_ITEM = (int)(ceil((float)(bounding_box_max[0] - bounding_box_min[0]) / 16.f)); \n"
" int Y_PER_ITEM = (int)(ceil((float)(bounding_box_max[1] - bounding_box_min[1]) / 16.f)); \n"
" float STARTING_X = bounding_box_min[0] + X_PER_ITEM * (LOCAL_ID % 16); \n"
" float ENDING_X = STARTING_X + X_PER_ITEM; \n"
" float STARTING_Y = bounding_box_min[1] + Y_PER_ITEM * (LOCAL_ID / 16); \n"
" float ENDING_Y = STARTING_Y + Y_PER_ITEM; \n"
" \n"
" \n"
" float3 point; \n"
" for(point[0] = STARTING_X; point[0] <= ENDING_X; point[0]++) { \n"
" for(point[1] = STARTING_Y; point[1] <= ENDING_Y; point[1]++) { \n"
" if(point[1] >= *screen_width || point[1] >= *screen_width) break; \n"
" float3 bc_coord = barycentric(vertices3, point); \n" " float3 bc_coord = barycentric(vertices3, point); \n"
" if (bc_coord[0] < 0 || bc_coord[1] < 0 || bc_coord[2] < 0) continue; \n" " if (bc_coord[0] < 0 || bc_coord[1] < 0 || bc_coord[2] < 0) continue; \n"
" \n" " \n"
@@ -114,13 +126,18 @@ const char* fragment_shader_kernel_source =
" float3 normalized_norm = normalize(normal); \n" " float3 normalized_norm = normalize(normal); \n"
" \n" " \n"
" \n" " \n"
" float intensity = clamp((dot(normalized_norm , (float3){light_dir[0], light_dir[1], light_dir[2]})), 0.f, 1.f); \n" " float intensity = clamp((dot(normalized_norm , (float3){light_dir[0], light_dir[1], light_dir[2]})), 0.f, 1.f) + 0.2; \n"
" \n" " \n"
" int color = 0; \n" " int color = 0; \n"
" color |= ((int)fmin((float)(diffuse_map[col_index + 0]) * intensity, (float) 0xff)) << 16; \n" " color |= ((int)fmin((float)(diffuse_map[col_index + 0]) * intensity, (float) 0xff)) << 16; \n"
" color |= ((int)fmin((float)(diffuse_map[col_index + 1]) * intensity, (float) 0xff)) << 8; \n" " color |= ((int)fmin((float)(diffuse_map[col_index + 1]) * intensity, (float) 0xff)) << 8; \n"
" color |= ((int)fmin((float)(diffuse_map[col_index + 2]) * intensity, (float) 0xff)) << 0; \n" " color |= ((int)fmin((float)(diffuse_map[col_index + 2]) * intensity, (float) 0xff)) << 0; \n"
" \n" " \n"
" //color |= ((int)fmin((float)(0xff) * intensity, (float) 0xff)) << 16; \n"
" //color |= ((int)fmin((float)(0xff) * intensity, (float) 0xff)) << 8; \n"
" //color |= ((int)fmin((float)(0xff) * intensity, (float) 0xff)) << 0; \n"
" \n"
" \n"
" z_buffer[(int) (point[0] + point[1] * *screen_width)] = point[2]; \n" " z_buffer[(int) (point[0] + point[1] * *screen_width)] = point[2]; \n"
" pixels [(int) (point[0] + point[1] * *screen_width)] = color;// & 0x00ffffff; \n" " pixels [(int) (point[0] + point[1] * *screen_width)] = color;// & 0x00ffffff; \n"
" } \n" " } \n"
+6 -5
View File
@@ -107,7 +107,7 @@ void vertex_shader(float* z, float* vertices, int vertex_count, float* new_verti
{ {
mat_z = clCreateBuffer(context, CL_MEM_READ_ONLY , sizeof(float) * MATRIX_SIZE , NULL, &err); mat_z = clCreateBuffer(context, CL_MEM_READ_ONLY , sizeof(float) * MATRIX_SIZE , NULL, &err);
vertices_mem = clCreateBuffer(context, CL_MEM_READ_ONLY , sizeof(float) * vertex_count * 3, NULL, &err); vertices_mem = clCreateBuffer(context, CL_MEM_READ_ONLY , sizeof(float) * vertex_count * 3, NULL, &err);
new_vertices_mem = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * vertex_count * 4, NULL, &err); new_vertices_mem = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_float4) * vertex_count, NULL, &err);
err = clSetKernelArg(vertex_shader_kernel, 0, sizeof(cl_mem), &mat_z); err = clSetKernelArg(vertex_shader_kernel, 0, sizeof(cl_mem), &mat_z);
err = clSetKernelArg(vertex_shader_kernel, 1, sizeof(cl_mem), &vertices_mem); err = clSetKernelArg(vertex_shader_kernel, 1, sizeof(cl_mem), &vertices_mem);
@@ -134,7 +134,7 @@ void clear(cl_mem* buffer, size_t size, const int pattern) {
} }
void fragment_shader( void fragment_shader(
int* faces, cl_int3* faces,
int nfaces, int nfaces,
float* uv, float* uv,
size_t uv_size, size_t uv_size,
@@ -149,7 +149,7 @@ void fragment_shader(
) { ) {
if (!fragment_shader_buffers_initialized) { if (!fragment_shader_buffers_initialized) {
faces_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY , sizeof(int) * 3 * 3 * nfaces , NULL, &err); faces_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY , sizeof(cl_int3) * 3 * nfaces , NULL, &err);
pixel_data_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(int) * screen_height * screen_width , NULL, &err); pixel_data_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(int) * screen_height * screen_width , NULL, &err);
screen_width_mem = clCreateBuffer(context, CL_MEM_READ_ONLY , sizeof(int) , NULL, &err); screen_width_mem = clCreateBuffer(context, CL_MEM_READ_ONLY , sizeof(int) , NULL, &err);
z_buffer_mem = clCreateBuffer(context, CL_MEM_READ_ONLY , sizeof(float) * screen_height * screen_width, NULL, &err); z_buffer_mem = clCreateBuffer(context, CL_MEM_READ_ONLY , sizeof(float) * screen_height * screen_width, NULL, &err);
@@ -174,7 +174,7 @@ void fragment_shader(
err = clSetKernelArg(fragment_shader_kernel, 10, sizeof(cl_mem), &diffuse_map_buffer); err = clSetKernelArg(fragment_shader_kernel, 10, sizeof(cl_mem), &diffuse_map_buffer);
err = clEnqueueWriteBuffer(commands, screen_width_mem , CL_FALSE, 0, sizeof(int) , &screen_width , 0, NULL, NULL); err = clEnqueueWriteBuffer(commands, screen_width_mem , CL_FALSE, 0, sizeof(int) , &screen_width , 0, NULL, NULL);
err = clEnqueueWriteBuffer(commands, faces_buffer , CL_FALSE, 0, sizeof(int) * 3 * 3 * nfaces , faces , 0, NULL, NULL); err = clEnqueueWriteBuffer(commands, faces_buffer , CL_FALSE, 0, sizeof(cl_int3) * 3 * nfaces , faces , 0, NULL, NULL);
err = clEnqueueWriteBuffer(commands, nfaces_mem , CL_FALSE, 0, sizeof(int) , &nfaces , 0, NULL, NULL); err = clEnqueueWriteBuffer(commands, nfaces_mem , CL_FALSE, 0, sizeof(int) , &nfaces , 0, NULL, NULL);
err = clEnqueueWriteBuffer(commands, uv_buffer , CL_FALSE, 0, uv_size , uv , 0, NULL, NULL); err = clEnqueueWriteBuffer(commands, uv_buffer , CL_FALSE, 0, uv_size , uv , 0, NULL, NULL);
err = clEnqueueWriteBuffer(commands, map_size_buffer , CL_FALSE, 0, sizeof(int) * 2 , map_size , 0, NULL, NULL); err = clEnqueueWriteBuffer(commands, map_size_buffer , CL_FALSE, 0, sizeof(int) * 2 , map_size , 0, NULL, NULL);
@@ -190,7 +190,8 @@ void fragment_shader(
clear(&z_buffer_mem, sizeof(float) * screen_width * screen_height, 0); clear(&z_buffer_mem, sizeof(float) * screen_width * screen_height, 0);
size_t fragment_shader_global[] = { nfaces }; size_t fragment_shader_global[] = { nfaces * 256 };
size_t framgent_shader_local[] = { 256 };
err = clEnqueueNDRangeKernel(commands, fragment_shader_kernel, 1, NULL, fragment_shader_global, NULL, 0, NULL, NULL); err = clEnqueueNDRangeKernel(commands, fragment_shader_kernel, 1, NULL, fragment_shader_global, NULL, 0, NULL, NULL);
+1 -1
View File
@@ -35,7 +35,7 @@ extern const char* fragment_shader_kernel_source;
void init_kernels(); void init_kernels();
void vertex_shader(float* z, float* vertices, int vertex_count, float* new_vertices); void vertex_shader(float* z, float* vertices, int vertex_count, float* new_vertices);
void fragment_shader( void fragment_shader(
int* faces, cl_int3* faces,
int nfaces, int nfaces,
float* uv, float* uv,
size_t uv_size, size_t uv_size,
+2 -2
View File
@@ -75,14 +75,14 @@ bool HandleButtonPressed() {
void CALLBACK FixedUpdate(HWND hwnd, UINT message, UINT uInt, DWORD dWord) void CALLBACK FixedUpdate(HWND hwnd, UINT message, UINT uInt, DWORD dWord)
{ {
TIME += 0.167; TIME += 0.03333333;
HandleButtonPressed(); HandleButtonPressed();
camera.ApplyChanges(); camera.ApplyChanges();
clock_t start = clock(); clock_t start = clock();
render(); render();
Update();
clock_t end = clock(); clock_t end = clock();
printf("FPS: %f\n", 1/((float)(end-start)/CLOCKS_PER_SEC)); printf("FPS: %f\n", 1/((float)(end-start)/CLOCKS_PER_SEC));
Update();
} }
LRESULT CALLBACK WndProc(HWND hwnd, UINT message, WPARAM wParam, LPARAM lParam) LRESULT CALLBACK WndProc(HWND hwnd, UINT message, WPARAM wParam, LPARAM lParam)
+11 -8
View File
@@ -14,9 +14,9 @@
#define VERTICAL_CAMERA_CLAMP_UP 90 #define VERTICAL_CAMERA_CLAMP_UP 90
#define VERTICAL_CAMERA_CLAMP_DOWN -90 #define VERTICAL_CAMERA_CLAMP_DOWN -90
#define NEAR_CLIP_PLANE 1.f #define NEAR_CLIP_PLANE 1.f
#define FAR_CLIP_PLANE 200.0f #define FAR_CLIP_PLANE 2000.0f
#define FOV 50 #define FOV 50
#define CAMERA_MOVEMENT_SPEED .1f #define CAMERA_MOVEMENT_SPEED .7f
#define DEFAULT_CAMERA_POS Vec3f(0, 0, 5) #define DEFAULT_CAMERA_POS Vec3f(0, 0, 5)
#define DEFAULT_CAMERA_ROT Vec3f(0, 0, 0) #define DEFAULT_CAMERA_ROT Vec3f(0, 0, 0)
#define LIGHT_INTENSITY 1.5 #define LIGHT_INTENSITY 1.5
@@ -30,13 +30,13 @@ Matrix ViewPort = Matrix::identity();
Matrix ModelView = Matrix::identity(); Matrix ModelView = Matrix::identity();
Matrix Projection = Matrix::identity(); Matrix Projection = Matrix::identity();
Model* model = new Model("african_head.obj"); Model* model = new Model("sakura.obj");
Camera camera; Camera camera;
Vec3f light_dir = Vec3f(1, 1, 1).normalize(); Vec3f light_dir = Vec3f(1, 1, 1).normalize();
float* new_verts = (float*)malloc(4 * sizeof(float) * model->nverts()); float* new_verts = (float*)malloc(4 * sizeof(float) * model->nverts());
int* faces = (int*)malloc(3 * 3 * sizeof(int) * model->nfaces()); cl_int3* faces = (cl_int3*)malloc(3 * sizeof(cl_int3) * model->nfaces());
bool init_flag = false; bool init_flag = false;
@@ -96,10 +96,13 @@ void render()
if (!init_flag) { if (!init_flag) {
//light_dir = camera.GetForward().normalize() * -1; //light_dir = camera.GetForward().normalize() * -1;
viewport(0, 0, screen_width, screen_height, FAR_CLIP_PLANE, NEAR_CLIP_PLANE); viewport(0, 0, screen_width, screen_height, FAR_CLIP_PLANE, NEAR_CLIP_PLANE);
for (int i = 0; i < model->nfaces(); i++) for (int i = 0; i < model->nfaces(); i++) {
for (int j = 0; j < 3; j++) for (int j = 0; j < 3; j++) {
for (int k = 0; k < 3; k++) faces[i * 3 + j].x = model->faces_[i][j][0];
faces[i * 9 + j * 3 + k] = model->faces_[i][j][k]; faces[i * 3 + j].y = model->faces_[i][j][1];
faces[i * 3 + j].z = model->faces_[i][j][2];
}
}
init_flag = true; init_flag = true;
} }
Binary file not shown.