I am trying to calculate vertex normals with a compute shader, but I seem to have a timing error

I accumulate the normals from all the faces adjacent to each vertex with this code:
Code :
#version 430 core
 
 
struct vertex_in_struct
{
  float x;
  float y;
  float z;
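  // accumulated normal, stored as uint bit patterns so that atomicCompSwap can operate on it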
  uint  nx;
  uint  ny;
  uint  nz;
};
 
 
layout( std430, binding=4 ) buffer Vertices
{
  vertex_in_struct Vertex[ ]; // array of structures
};
 
 
layout( std430, binding=5 ) readonly buffer Faces
{
  int Face[ ]; // flat array of vertex indices, three per face
};
 
 
//uniform int   u_Group_Offset_X;
uniform uint  u_FaceCount;
 
 
layout( local_size_x = 128, local_size_y = 1, local_size_z = 1 ) in;
 
 
void main()
{
  uint gid = gl_GlobalInvocationID.x; // +u_Group_Offset_X; // the .y and .z are both 1 in this case
 
 
  if (gid < u_FaceCount)
  {
    int vi[3];
    vec3 v[3];
 
 
    // get vertex indices
    vi[0] = Face[gid*3];
    vi[1] = Face[gid*3+1];
    vi[2] = Face[gid*3+2];
 
 
    // get the three vertex positions
    for (int i = 0; i < 3; i++)
    {
      float vx = Vertex[vi[i]].x;
      float vy = Vertex[vi[i]].y;
      float vz = Vertex[vi[i]].z;
 
 
      v[i] = vec3(vx,vy,vz);
    }
 
 
    // calculate the face normal, weighted by triangle size
    // (the cross product's length is twice the triangle's area)
    vec3 norm = cross( v[2] - v[0], v[1] - v[0] );
 
 
    memoryBarrierBuffer();
 
 
    // accumulate normal
    uint prevVal;
    uint newVal;
 
 
    for (int i = 0; i < 3; i++)
    {
      // GLSL has no atomic float add (outside vendor extensions on some
      // NVIDIA GPUs), so emulate one with a compare-and-swap loop:
      // re-read the slot, add, and swap, retrying until no other
      // invocation has written it in between
      do
      {
        prevVal = Vertex[vi[i]].nx;
        newVal  = floatBitsToUint( norm.x + uintBitsToFloat( prevVal ) );
      } while ( atomicCompSwap( Vertex[vi[i]].nx, prevVal, newVal ) != prevVal );
      do
      {
        prevVal = Vertex[vi[i]].ny;
        newVal  = floatBitsToUint( norm.y + uintBitsToFloat( prevVal ) );
      } while ( atomicCompSwap( Vertex[vi[i]].ny, prevVal, newVal ) != prevVal );
      do
      {
        prevVal = Vertex[vi[i]].nz;
        newVal  = floatBitsToUint( norm.z + uintBitsToFloat( prevVal ) );
      } while ( atomicCompSwap( Vertex[vi[i]].nz, prevVal, newVal ) != prevVal );
    }
 
 
  }
}
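
The three loops above repeat the same compare-and-swap idea, so as a sketch they could be factored into a macro like the one below (a macro rather than a function, because GLSL's atomic functions must be applied to the buffer variable itself, and function parameters are only copies):
Code :
// emulated atomic float add on a uint-typed SSBO member:
// spin until no other invocation has modified the slot between
// our read and our compare-and-swap
#define ATOMIC_ADD_FLOAT( MEM, VALUE )                                      \
  {                                                                         \
    uint oldBits, newBits;                                                  \
    do                                                                      \
    {                                                                       \
      oldBits = MEM;                                                        \
      newBits = floatBitsToUint( uintBitsToFloat( oldBits ) + (VALUE) );    \
    } while ( atomicCompSwap( MEM, oldBits, newBits ) != oldBits );         \
  }
 
// usage, replacing the three loops above:
// ATOMIC_ADD_FLOAT( Vertex[vi[i]].nx, norm.x );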

Then I normalise the accumulated values with this:

Code :
#version 430 core
 
 
struct vertex_in_struct
{
  float x;
  float y;
  float z;
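  // the same buffer as in the first pass; the accumulated normal is read back here as plain floats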
  float nx;
  float ny;
  float nz;
};
 
 
layout( std430, binding=4 ) buffer Vertices
{
  vertex_in_struct Vertex[ ];
};
 
 
uniform int   u_Group_Offset_X;
uniform uint  u_VertexCount;
 
 
layout( local_size_x = 128, local_size_y = 1, local_size_z = 1 ) in;
 
 
void main()
{
  uint gid = gl_GlobalInvocationID.x+u_Group_Offset_X; // the .y and .z are both 1 in this case
 
 
  if (gid < u_VertexCount)
  {
    vec3 norm = normalize(vec3(Vertex[gid].nx,Vertex[gid].ny,Vertex[gid].nz));
 
 
    memoryBarrierBuffer();
 
 
    Vertex[gid].nx = norm.x;
    Vertex[gid].ny = norm.y;
    Vertex[gid].nz = norm.z;
  }
}
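
For completeness: the nx/ny/nz accumulators have to start at zero before the accumulation pass runs, otherwise stale sums from the previous run leak into the result. A minimal clearing pass for that step, reusing the declarations above, would look something like this:
Code :
#version 430 core
 
struct vertex_in_struct
{
  float x;
  float y;
  float z;
  uint  nx;
  uint  ny;
  uint  nz;
};
 
layout( std430, binding=4 ) buffer Vertices
{
  vertex_in_struct Vertex[ ];
};
 
uniform uint  u_VertexCount;
 
layout( local_size_x = 128, local_size_y = 1, local_size_z = 1 ) in;
 
void main()
{
  uint gid = gl_GlobalInvocationID.x;
 
  if (gid < u_VertexCount)
  {
    // floatBitsToUint(0.0) is 0u, so plain zeros clear the accumulators
    Vertex[gid].nx = 0u;
    Vertex[gid].ny = 0u;
    Vertex[gid].nz = 0u;
  }
}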


I used a shader to display the normals at each vertex. Most of my normals look right, but some appear to be un-normalised.
If I normalise them again in the display shader, they all look fine.

I have tried memoryBarrier() and memoryBarrierBuffer() inside the shaders to keep the load/modify/store sequences on the buffers in sync.