Compute shader produces wrong output, possibly a synchronization error

Hey there,
I’m having trouble with my compute shader. I’m not sure what causes the problem, but I suspect I’m missing something with thread synchronization. I’m doing a simple reduction:

   #version 430 core
    
    #define SIZE 256
    #define CLUSTERS 5
    
    layout(local_size_x = 16, local_size_y = 16, local_size_z = 1) in;
    
    layout(binding = 0) uniform sampler2D normals;
    uniform ivec2 u_texSize;
    
    struct Cluster {
    	vec3 cntr;
    	uint size;
    };
    
    coherent layout(std430, binding = 0) buffer destBuffer {
    	Cluster clusters[CLUSTERS];
    	bool moving;
    } ob;
    
    shared uint sizeCache[SIZE];
    
    void main() {
    	const ivec2 pos = ivec2(gl_GlobalInvocationID.xy);
    	const vec2 coords = vec2(pos.x / float(u_texSize.x), pos.y / float(u_texSize.y));
    	const vec3 N = texture(normals, coords).rgb; // texture2D() is not available in a core profile
    	const bool isNormal = length(N) != 0;
    
    	const uint id = pos.y * (gl_WorkGroupSize.x + gl_NumWorkGroups.x) + pos.x; // meant to be a unique id across the whole dispatch
    	
    	barrier();
    	sizeCache[gl_LocalInvocationIndex] = uint(isNormal);
    	// parallel reduction of the flags in shared memory
    	int stepv = (SIZE >> 1);
    	while(stepv > 0) {
    		if (gl_LocalInvocationIndex < stepv) {
    			sizeCache[gl_LocalInvocationIndex] += sizeCache[gl_LocalInvocationIndex + stepv];
    		}
    		memoryBarrierShared();
    		barrier();
    		stepv = (stepv >> 1);
    	}
    
    	if (gl_LocalInvocationIndex == 0) {
    		// the first invocation of each work group accumulates the group's count
    		atomicAdd(ob.clusters[0].size, sizeCache[0]);
    	}
    	barrier();
    	memoryBarrier();

    	if(id == 0) {
    		ob.clusters[0].size = 13; //this has no effect
    		ob.clusters[1].size = 1;  //this works
    	}
    }

The reduction works and produces the correct result. However, if I try to overwrite the value after the reduction, the write has no effect; the value doesn’t change. My guess is that it gets overwritten by a pending atomicAdd from another invocation, but I don’t understand why.
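To illustrate what I mean, here is a minimal CPU analogy of what I suspect is happening (a hypothetical sketch using std::atomic and std::thread, not my actual code): if the final store is not ordered after all the adds, it can end up buried under them.

	#include <atomic>
	#include <cstdio>
	#include <thread>
	
	int main() {
		std::atomic<unsigned> size{0};
	
		// each "workgroup" adds its partial count, like atomicAdd(ob.clusters[0].size, ...)
		auto group = [&] { size.fetch_add(64); };
	
		std::thread a(group), b(group);
		size.store(13); // like "ob.clusters[0].size = 13" in the id == 0 invocation
	
		a.join();
		b.join();
	
		// depending on the interleaving, the adds can land after the store,
		// so 13 is not what ends up in memory
		std::printf("%u\n", size.load());
		return 0;
	}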

This is how I dispatch the compute shader and read the result back on the CPU:

	glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, resultBuffer);
		
	glDispatchCompute((GLuint)ceil(winSize.x / 16.f), (GLuint)ceil(winSize.y / 16.f), 1);
	glCheckError();
		
	glMemoryBarrier(GL_ALL_BARRIER_BITS); // all bits is overkill, just for debugging
	
	struct Cl {
		glm::vec3 cntr;
		uint size;
	};
	
	glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, resultBuffer); // also binds the generic target that glGetBufferSubData reads from
	
	std::vector<Cl> data(5);
	glGetBufferSubData(GL_SHADER_STORAGE_BUFFER, 0, sizeOfresult, data.data());
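In case it matters, the buffer is created and zero-initialized roughly like this (a sketch; the exact flags and the size computation here are assumptions rather than my verbatim code):

	// hypothetical setup sketch; names mirror the snippets above
	struct Cl {
		glm::vec3 cntr;
		uint size;
	}; // 16 bytes, matches the std430 layout of Cluster
	
	const GLsizeiptr sizeOfresult = 5 * sizeof(Cl);            // the clusters[] part I read back
	const GLsizeiptr bufSize = sizeOfresult + sizeof(GLuint);  // + 'moving' (a bool takes 4 bytes in std430)
	
	glGenBuffers(1, &resultBuffer);
	glBindBuffer(GL_SHADER_STORAGE_BUFFER, resultBuffer);
	std::vector<char> zeros(bufSize, 0);
	glBufferData(GL_SHADER_STORAGE_BUFFER, bufSize, zeros.data(), GL_DYNAMIC_READ);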

I have an NVIDIA GT630M card and run Linux with the NVIDIA proprietary driver (331.49).

Thanks for any help :slight_smile: