PDA

View Full Version : SSBO and VBO help



CaptainSnugglebottom
01-24-2018, 07:11 PM
Hello,

I decided to implement batch rendering using an SSBO, where the SSBO will contain an array of structs that holds all the data for each object (let's say 100 objects). Along the way I also had to change the VBO in order to accommodate some changes from the previous pipeline.

The problem is, nothing renders. I double-checked my math and everything checks out. glGetError() also showed no issues (perhaps I didn't check for the proper enumerator), so I figured I'd ask for the forum's opinion.


The setup is very simple:
- each struct has a bunch of flags and other data for each object.
- as the vertex data is pushed into VBO, the object's index in the array is also pushed into the VBO.
- shader accesses the struct using the object index that was acquired from vertex data
- shader uses the struct's data


This is the struct I am pushing into the SSBO:


// CPU-side record describing one object; one copy per object is uploaded into
// the SSBO with glBufferSubData at offset sizeof(objectVarsData) * objectIndex.
// The two numbers after each member are its size in bytes and the running total.
// NOTE(review): the GLSL objectVarsData in the shaders declares scaleVec as a
// vec3 (here it is a single float), and std430 aligns a vec3 like a vec4
// (OpenGL spec 7.6.2.2), so this tightly packed 100-byte layout presumably does
// not match the offsets the shaders read from -- confirm.
struct objectVarsData {
float posVec[3]; // 12 12
float rotVec[3]; // 12 24
float scaleVec; // 4 28

int textureLayer; // 4 32

int drawEnable; // 4 36
int colorMapEnable; // 4 40
float colorVec[3]; // 12 52
float alphaScale; // 4 56

int normalMapEnable; // 4 60

int specularMapEnable; // 4 64

int lightMapEnable; // 4 68
float lightVec[3]; // 12 80
float lightScale; // 4 84

int controlEnable; // 4 88
float controlColorVec[3]; // 12 100
};

This is how I initialize my SSBO and VBO:


// Object VBO: allocate the storage only; vertex data is uploaded later.
glGenBuffers(1, &(this->objectVBO));
glBindBuffer(GL_ARRAY_BUFFER, this->objectVBO);
glBufferData(GL_ARRAY_BUFFER, graphics2DMaximumVBOSize_Byte, NULL, GL_STATIC_DRAW);

// Object VAO: records the attribute layout of the interleaved vertex
// (position, tangent, bitangent, normal, UV, object index).
glGenVertexArrays(1, &(this->objectVAO));
glBindVertexArray(this->objectVAO);
// Fix: bind the VBO here. The original passed this->objectVAO, a vertex-array
// name, to glBindBuffer. Each glVertexAttrib*Pointer call captures whatever
// buffer is bound to GL_ARRAY_BUFFER at that moment, so the attributes must be
// specified while the vertex buffer itself is bound.
glBindBuffer(GL_ARRAY_BUFFER, this->objectVBO);
glVertexAttribPointer(0, graphicsVertexPosSize, GL_FLOAT, GL_FALSE, graphicsVertexDataSize_Byte, NULL);
glEnableVertexAttribArray(0);
glVertexAttribPointer(1, graphicsVertexTangentSize, GL_FLOAT, GL_FALSE, graphicsVertexDataSize_Byte, (GLvoid*)(sizeof(GLfloat) * graphicsVertexTangentOffset));
glEnableVertexAttribArray(1);
glVertexAttribPointer(2, graphicsVertexBitangentSize, GL_FLOAT, GL_FALSE, graphicsVertexDataSize_Byte, (GLvoid*)(sizeof(GLfloat) * graphicsVertexBitangentOffset));
glEnableVertexAttribArray(2);
glVertexAttribPointer(3, graphicsVertexNormalSize, GL_FLOAT, GL_FALSE, graphicsVertexDataSize_Byte, (GLvoid*)(sizeof(GLfloat) * graphicsVertexNormalOffset));
glEnableVertexAttribArray(3);
glVertexAttribPointer(4, graphicsVertexUVSize, GL_FLOAT, GL_FALSE, graphicsVertexDataSize_Byte, (GLvoid*)(sizeof(GLfloat) * graphicsVertexUVOffset));
glEnableVertexAttribArray(4);
// The object index is an integer attribute (uint in the shader), so it uses
// the integer variant glVertexAttribIPointer rather than glVertexAttribPointer.
glVertexAttribIPointer(5, graphicsVertexObjectIndexSize, GL_UNSIGNED_INT, graphicsVertexDataSize_Byte, (GLvoid*)(sizeof(GLfloat) * graphicsVertexObjectIndexOffset));
glEnableVertexAttribArray(5);


// Object SSBO: allocate the storage and attach it to binding point 1, the
// binding the shaders declare for the objectVars buffer block.
glGenBuffers(1, &(this->objectSSBO));
glBindBuffer(GL_SHADER_STORAGE_BUFFER, this->objectSSBO);
glBufferData(GL_SHADER_STORAGE_BUFFER, graphics2DMaximumSSBOSize_Byte, NULL, GL_STATIC_DRAW);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, this->objectSSBO);


Then I push everything into the substack (currently I don't account for substack overflow; for testing I just do 1 object):



// Running totals for the current substack. Both are passed by address to
// push2DObject below and reset to zero after every flush.
unsigned int objectCount = 0;
unsigned int vertexIndex = 0;
graphicsObject * bufferObject;

std::cout << " - Pushing Objects:" << std::endl;
// One iteration per queue entry: take the current lowest entry, push it into
// the GL buffers, then rotate the queue.
for (unsigned int i = 0; i < this->objectQueue->size; i++) {
bufferObject = (graphicsObject *) this->objectQueue->lowestEntry->attachedObject;

std::cout << " Object " << bufferObject->objectID << " with " << bufferObject->meshSize << " geometries at " << graphics2DStartDepth + graphics2DDepthChange*(objectCount) << std::endl;
this->push2DObject( bufferObject, &objectCount, &vertexIndex);

graphicsLinkedListRotateBackward(this->objectQueue);

// Flush as soon as the substack holds the maximum number of objects.
if (objectCount == graphics2DMaximumObjects) { // Assumes 1 object per push
std::cout << " - Flushing Substack (Substack full)..." << std::endl;
std::cout << " Object Count: " << objectCount << std::endl;
std::cout << " Vertex Count: " << vertexIndex << std::endl;

this->drawSubStack(vertexIndex); // Stages 1,2 and 3
this->drawToBuffer(); // Stage 4

std::cout << " - Continuing Pushing..." << std::endl;
objectCount = 0;
vertexIndex = 0;
}

}

// Draw whatever is left over after the last full flush.
if (vertexIndex > 0) {
std::cout << " - Flushing Substack (Remains):" << std::endl;
std::cout << " Object Count: " << objectCount << std::endl;
std::cout << " Vertex Count: " << vertexIndex << std::endl;

this->drawSubStack(vertexIndex); // Stages 1,2 and 3
this->drawToBuffer(); // Stage 4
}

Where each object is pushed using:


// Body of the per-object push: fills the staging struct, uploads it to the
// SSBO slot for this object, then uploads the object's vertices to the VBO.
// objectCount and vertexIndex are pointers to the caller's running counters.

// Slot of this object in the SSBO array; taken before the counter is advanced.
unsigned int objectIndex = *objectCount;
objectCount[0]++;

// Load the data into buffer struct
this->objectDataBuffer.posVec[0] = object->posVec[0];
this->objectDataBuffer.posVec[1] = object->posVec[1];
// Depth is derived from the (already incremented) object count rather than
// from the object's own z position.
this->objectDataBuffer.posVec[2] = 1.0 - graphics2DDepthChange*(*objectCount); // change if the triangles are outside the camera limit
this->objectDataBuffer.rotVec[0] = object->rotVec[0];
this->objectDataBuffer.rotVec[1] = object->rotVec[1];
this->objectDataBuffer.rotVec[2] = object->rotVec[2];
this->objectDataBuffer.scaleVec = *(object->scaleVec);

this->objectDataBuffer.textureLayer = object->textureLayer;

this->objectDataBuffer.drawEnable = object->drawEnable;
this->objectDataBuffer.colorMapEnable = object->colorMapEnable;
this->objectDataBuffer.colorVec[0] = object->colorVec[0];
this->objectDataBuffer.colorVec[1] = object->colorVec[1];
this->objectDataBuffer.colorVec[2] = object->colorVec[2];
this->objectDataBuffer.alphaScale = *(object->alphaScale);

this->objectDataBuffer.normalMapEnable = object->normalMapEnable;

this->objectDataBuffer.specularMapEnable = object->specularMapEnable;

this->objectDataBuffer.lightMapEnable = object->lightMapEnable;
this->objectDataBuffer.lightVec[0] = object->lightVec[0];
this->objectDataBuffer.lightVec[1] = object->lightVec[1];
this->objectDataBuffer.lightVec[2] = object->lightVec[2];
this->objectDataBuffer.lightScale = *(object->lightScale);

this->objectDataBuffer.controlEnable = object->controlEnable;
this->objectDataBuffer.controlColorVec[0] = object->controlColorVec[0];
this->objectDataBuffer.controlColorVec[1] = object->controlColorVec[1];
this->objectDataBuffer.controlColorVec[2] = object->controlColorVec[2];

// Load the data into OpenGL buffers
// Shader Storage Data
glBindBuffer(GL_SHADER_STORAGE_BUFFER, this->objectSSBO);
glBufferSubData(GL_SHADER_STORAGE_BUFFER, sizeof(objectVarsData)*objectIndex, sizeof(objectVarsData), &(this->objectDataBuffer));

// Vertex Array Data
// Each iteration uploads three vertices (one triangle); every vertex gets its
// mesh data, its UV, and the owning object's index, written back to back.
// NOTE(review): the source arrays object->meshPoints / object->uvPoints are
// indexed with *vertexIndex, the running index into the shared VBO, not with
// an index local to this object -- presumably wrong for any object that is not
// the first in the substack; confirm.
glBindBuffer(GL_ARRAY_BUFFER, this->objectVBO);
for (int i = 0; i < object->meshSize; i++) {
glBufferSubData(GL_ARRAY_BUFFER, graphicsVertexDataSize_Byte * (*vertexIndex), graphicsVertexMeshDataSize_Byte, &object->meshPoints[(*vertexIndex) * graphicsVertexMeshDataSize]);
glBufferSubData(GL_ARRAY_BUFFER, graphicsVertexDataSize_Byte * (*vertexIndex) + graphicsVertexMeshDataSize_Byte, graphicsVertexUVSize_Byte, &object->uvPoints[(*vertexIndex) * graphicsVertexUVSize]);
glBufferSubData(GL_ARRAY_BUFFER, graphicsVertexDataSize_Byte * (*vertexIndex) + graphicsVertexMeshDataSize_Byte + graphicsVertexUVSize_Byte, graphicsVertexObjectIndexSize_Byte, &objectIndex);

vertexIndex[0]++;

glBufferSubData(GL_ARRAY_BUFFER, graphicsVertexDataSize_Byte * (*vertexIndex), graphicsVertexMeshDataSize_Byte, &object->meshPoints[(*vertexIndex) * graphicsVertexMeshDataSize]);
glBufferSubData(GL_ARRAY_BUFFER, graphicsVertexDataSize_Byte * (*vertexIndex) + graphicsVertexMeshDataSize_Byte, graphicsVertexUVSize_Byte, &object->uvPoints[(*vertexIndex) * graphicsVertexUVSize]);
glBufferSubData(GL_ARRAY_BUFFER, graphicsVertexDataSize_Byte * (*vertexIndex) + graphicsVertexMeshDataSize_Byte + graphicsVertexUVSize_Byte, graphicsVertexObjectIndexSize_Byte, &objectIndex);

vertexIndex[0]++;

glBufferSubData(GL_ARRAY_BUFFER, graphicsVertexDataSize_Byte * (*vertexIndex), graphicsVertexMeshDataSize_Byte, &object->meshPoints[(*vertexIndex) * graphicsVertexMeshDataSize]);
glBufferSubData(GL_ARRAY_BUFFER, graphicsVertexDataSize_Byte * (*vertexIndex) + graphicsVertexMeshDataSize_Byte, graphicsVertexUVSize_Byte, &object->uvPoints[(*vertexIndex) * graphicsVertexUVSize]);
glBufferSubData(GL_ARRAY_BUFFER, graphicsVertexDataSize_Byte * (*vertexIndex) + graphicsVertexMeshDataSize_Byte + graphicsVertexUVSize_Byte, graphicsVertexObjectIndexSize_Byte, &objectIndex);

vertexIndex[0]++;

// Flush mid-object once the triangle budget of the substack is reached.
if ((*vertexIndex / 3) >= graphics2DMaximumTriangles) {
// NOTE(review): objectCount and vertexIndex are pointers in this block (they
// are dereferenced everywhere else), so these two lines stream the addresses,
// not the counts.
std::cout << " Object Count: " << objectCount << std::endl;
std::cout << " Vertex Count: " << vertexIndex << std::endl;

this->drawSubStack((*vertexIndex)); // Stages 1,2 and 3
this->drawToBuffer(); // Stage 4

(*objectCount) = 0;
(*vertexIndex) = 0;
}
}

Finally, I draw the objects:


// Stage 1: render the pushed substack into subSceneFBO1.
glBindFramebuffer(GL_FRAMEBUFFER, this->subSceneFBO1);
glClearColor(0.0f, 0.0f, 0.0f, 0.0f);
glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);

glUseProgram(this->subSceneShaderArray[0]);

// Bind the four material maps to texture units 0..3 and point the matching
// sampler uniforms at those units.
glActiveTexture(GL_TEXTURE0);
glBindTexture(GL_TEXTURE_2D, this->textureAsset->colorMapID);
glUniform1i(this->stage1ColorMapLocation, 0);

glActiveTexture(GL_TEXTURE1);
glBindTexture(GL_TEXTURE_2D, this->textureAsset->normalMapID);
glUniform1i(this->stage1NormalMapLocation, 1);

glActiveTexture(GL_TEXTURE2);
glBindTexture(GL_TEXTURE_2D, this->textureAsset->specularMapID);
glUniform1i(this->stage1SpecularMapLocation, 2);

glActiveTexture(GL_TEXTURE3);
glBindTexture(GL_TEXTURE_2D, this->textureAsset->lightMapID);
glUniform1i(this->stage1LightMapLocation, 3);

glEnable(GL_DEPTH_TEST);
glDepthMask(GL_TRUE);
glDisable(GL_BLEND);

// Binding SSBO
glBindBuffer(GL_SHADER_STORAGE_BUFFER, this->objectSSBO);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, this->objectSSBO);

// Binding VBO
glBindVertexArray(this->objectVAO);
glBindBuffer(GL_ARRAY_BUFFER, this->objectVBO);
// Fix: the third argument of glDrawArrays is the number of vertices to draw.
// vertexIndex already is the number of vertices pushed, so it must not be
// multiplied by the per-vertex data size; doing so made GL read far past the
// uploaded vertices.
glDrawArrays(GL_TRIANGLES, 0, vertexIndex);

Vertex shader:


// shadertype=glsl
#version 450

layout (location = 0) in vec3 vertexPos;
layout (location = 1) in vec3 tangent;
layout (location = 2) in vec3 bitangent;
layout (location = 3) in vec3 normal;
layout (location = 4) in vec2 texCoord;
layout (location = 5) in uint objectIndex;

layout(std140, binding = 0) uniform cameraVars
{
mat4 projectionMat; // 64 bytes
mat4 cameraMat; // 128 bytes
};

// Shader-side view of one element of the objectVars storage buffer.
// NOTE(review): scaleVec is a vec3 here, while the C++ struct posted with this
// shader stores a single float for it; in addition std430 aligns every vec3
// like a vec4 (OpenGL spec 7.6.2.2), so the member offsets presumably do not
// line up with a tightly packed C++ struct -- confirm.
struct objectVarsData {
vec3 posVec;
vec3 rotVec;
vec3 scaleVec;

int textureLayer;

int drawEnable;
int colorMapEnable;
vec3 colorVec;
float alphaScale;

int normalMapEnable;

int specularMapEnable;

int lightMapEnable;
vec3 lightVec;
float lightScale;

int controlEnable;
highp vec3 controlColorVec;
};

layout(std430, binding = 1) buffer objectVars
{
objectVarsData objectVarsArray[10000];
};

out vec2 tex_coord;
out mat3 normal_tran_mat;
out float object_depth;
flat out uint object_index;

///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Vertex stage: looks up the per-object record by the vertex's object index,
// builds the object's world transform (translate * rotZ * rotY * rotX, then
// the camera matrix), and outputs the transformed position, the UV, the
// tangent-space basis and the object depth.
void main() {
    // Forwarded flat to the fragment stage so it can read the same record.
    object_index = objectIndex;

    objectVarsData objectVarsDataBuffer = objectVarsArray[object_index];

    vec3 objectPosVec = objectVarsDataBuffer.posVec;
    vec3 objectRotVec = objectVarsDataBuffer.rotVec;
    vec3 objectScaleVec = objectVarsDataBuffer.scaleVec;

    vec3 vertexTangent = tangent;
    vec3 vertexBitangent = bitangent;
    vec3 vertexNormal = normal;

    // WORLD SPACE MATRIX CREATION
    // Position (the y coordinate is negated)
    mat4 objectPosMat = mat4(1.0);
    objectPosMat[3].xyz = vec3(objectPosVec.x, -objectPosVec.y, objectPosVec.z);

    // Rotation X
    mat4 objectRotXMat = mat4(1.0);
    objectRotXMat[1] = vec4(0.0, cos(objectRotVec[0]), sin(objectRotVec[0]), 0.0);
    objectRotXMat[2] = vec4(0.0, -sin(objectRotVec[0]), cos(objectRotVec[0]), 0.0);

    // Rotation Y
    mat4 objectRotYMat = mat4(1.0);
    objectRotYMat[0] = vec4(cos(objectRotVec[1]), 0.0, -sin(objectRotVec[1]), 0.0);
    objectRotYMat[2] = vec4(sin(objectRotVec[1]), 0.0, cos(objectRotVec[1]), 0.0);

    // Rotation Z
    mat4 objectRotZMat = mat4(1.0);
    objectRotZMat[0] = vec4( cos(objectRotVec[2]), sin(objectRotVec[2]), 0.0, 0.0);
    objectRotZMat[1] = vec4(-sin(objectRotVec[2]), cos(objectRotVec[2]), 0.0, 0.0);

    // TOTAL TRANSFORMATION MATRIX
    mat4 transformMat = cameraMat * objectPosMat * objectRotZMat * objectRotYMat * objectRotXMat; //cameraMat

    // TRANSFORMING OBJECT
    tex_coord = texCoord;

    object_depth = 1.0 - objectPosVec[2];

    // Transform the tangent frame as directions (w = 0) and re-normalize.
    // Fix: the identifiers were split by stray spaces ("vertexTangen t",
    // "vertexBitang ent", "vertexNormal )"), which does not compile.
    vec4 bufferVec1 = normalize(transformMat * vec4(normalize(vertexTangent), 0.0));
    vec4 bufferVec2 = normalize(transformMat * vec4(normalize(vertexBitangent), 0.0));
    vec4 bufferVec3 = normalize(transformMat * vec4(normalize(vertexNormal), 0.0));
    normal_tran_mat = mat3(bufferVec1.xyz, bufferVec2.xyz, bufferVec3.xyz);

    // Per-axis scale is applied to the model-space position before the transform.
    vec3 vertexPosBuffer = vertexPos;
    vertexPosBuffer.x = vertexPos.x * objectScaleVec.x;
    vertexPosBuffer.y = vertexPos.y * objectScaleVec.y;
    vertexPosBuffer.z = vertexPos.z * objectScaleVec.z;

    gl_Position = projectionMat * transformMat * vec4(vertexPosBuffer, 1.0); //projectionMat
}

Fragment shader:


// shadertype=glsl
#version 450

uniform sampler2D colorMap;
uniform sampler2D normalMap;
uniform sampler2D specularMap;
uniform sampler2D lightMap;

struct objectVarsData {
vec3 posVec;
vec3 rotVec;
vec3 scaleVec;

int textureLayer;

int drawEnable;
int colorMapEnable;
vec3 colorVec;
float alphaScale;

int normalMapEnable;

int specularMapEnable;

int lightMapEnable;
vec3 lightVec;
float lightScale;

int controlEnable;
highp vec3 controlColorVec;
};

layout(std430, binding = 1) buffer objectVars
{
objectVarsData objectVarsArray[10000];
};

in vec2 tex_coord;
in mat3 normal_tran_mat;
in float object_depth;
flat in uint object_index;

layout(location = 0) out vec4 stage1ColorMap;
layout(location = 1) out vec4 stage1NormalMap;
layout(location = 2) out vec4 stage1SpecularMap;
layout(location = 3) out vec4 stage1LightMap;
layout(location = 4) out vec4 stage1ControlMap;

///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Fragment stage: reads the per-object record selected by object_index and
// writes the five stage-1 render targets (color, normal, specular, light,
// control). Fragments whose final alpha is not exactly 1.0 are discarded.
void main() {

objectVarsData objectVarsDataBuffer = objectVarsArray[object_index];

// Local copies of the per-object flags and parameters.
int textureLayer = objectVarsDataBuffer.textureLayer;
int drawEnable = objectVarsDataBuffer.drawEnable;
int colorMapEnable = objectVarsDataBuffer.colorMapEnable;
vec3 colorVec = objectVarsDataBuffer.colorVec;
float alphaScale = clamp(objectVarsDataBuffer.alphaScale, 0.0, 1.0);
int normalMapEnable = objectVarsDataBuffer.normalMapEnable;
int specularMapEnable = objectVarsDataBuffer.specularMapEnable;
int lightMapEnable = objectVarsDataBuffer.lightMapEnable;
vec3 lightColorVec = objectVarsDataBuffer.lightVec;
float lightScale = objectVarsDataBuffer.lightScale;
int controlEnable = objectVarsDataBuffer.controlEnable;
highp vec3 controlColorVec = objectVarsDataBuffer.controlColorVec;

// All four maps are sampled unconditionally at the interpolated UV.
vec4 textureBuffer = texture(colorMap, tex_coord);
vec4 normalBuffer = texture(normalMap, tex_coord);
vec4 specularBuffer = texture(specularMap, tex_coord);
vec4 lightBuffer = texture(lightMap, tex_coord);

float fragmentAlpha = alphaScale*textureBuffer.a;
fragmentAlpha = clamp(fragmentAlpha, 0.0, 1.0);

// Only fully opaque fragments are kept; everything else takes the discard branch.
if (fragmentAlpha == 1.0) {

// CONTROL MAP controlColorVec
if (controlEnable == 1) {
stage1ControlMap = vec4(controlColorVec, 1.0);
} else {
stage1ControlMap = vec4(0.0, 0.0, 0.0, 0.0);
}

// IMAGE MAPS
if (drawEnable == 1) {

// Flat object color when the color map is disabled, texture color otherwise.
if (colorMapEnable == 0) {
stage1ColorMap = vec4(colorVec, 1.0);
} else {
stage1ColorMap = vec4(textureBuffer.xyz, 1.0);
}

if (normalMapEnable == 0) {
stage1NormalMap = vec4(0.0, 0.0, 0.0, 1.0);
} else {
// Expand the sampled normal from [0,1] to [-1,1], rotate it with the
// tangent-frame matrix, then pack it back into [0,1] for storage.
vec3 normalFromMap = 2.0*normalBuffer.xyz - vec3(1.0, 1.0, 1.0);
stage1NormalMap = vec4((normalize(normal_tran_mat*normalFromMap) + 1.0) / 2.0, 1.0);
}

if (specularMapEnable == 0) {
stage1SpecularMap = vec4(0.0, 0.0, 0.0, 1.0);
} else {
stage1SpecularMap = vec4(vec3(specularBuffer.x),1.0);
}

if (lightMapEnable == 0) {
stage1LightMap = vec4(0.0, 0.0, 0.0, 1.0);
} else {
stage1LightMap = vec4(lightBuffer.xyz,1.0);
}
} else {
// Drawing disabled: clear all image outputs for this fragment.
stage1ColorMap = vec4(0.0, 0.0, 0.0, 0.0);
stage1NormalMap = vec4(0.0, 0.0, 0.0, 0.0);
stage1SpecularMap = vec4(0.0, 0.0, 0.0, 0.0);
stage1LightMap = vec4(0.0, 0.0, 0.0, 0.0);
}

// Depth comes from the per-object value computed in the vertex stage.
gl_FragDepth = object_depth;
} else {
discard;
}
}

These are the only parts of the code that were changed, the rest of my code was unchanged from previous render version and well tested.

Any suggestions on debugging techniques for SSBOs would be awesome.

Thanks

CaptainSnugglebottom
01-28-2018, 06:32 PM
It appears I made a mistake in the code above, this is how I upload the data to the VBO:


// Revised per-object push: fills the staging struct, uploads it to this
// object's SSBO slot, then uploads every vertex of the object to the VBO.
// objectCount and vertexIndex are pointers to the caller's running counters.

// Slot of this object in the SSBO array; taken before the counter is advanced.
unsigned int objectIndex = *objectCount;
objectCount[0]++;

// Load the data into buffer struct
this->objectDataBuffer.posVec[0] = object->posVec[0];
this->objectDataBuffer.posVec[1] = object->posVec[1];
this->objectDataBuffer.posVec[2] = 1.0 - graphics2DDepthChange*(*objectCount); // change if the triangles are outside the camera limit
this->objectDataBuffer.rotVec[0] = object->rotVec[0];
this->objectDataBuffer.rotVec[1] = object->rotVec[1];
this->objectDataBuffer.rotVec[2] = object->rotVec[2];
this->objectDataBuffer.scaleVec = *(object->scaleVec);

this->objectDataBuffer.textureLayer = object->textureLayer;

this->objectDataBuffer.drawEnable = object->drawEnable;
this->objectDataBuffer.colorMapEnable = object->colorMapEnable;
this->objectDataBuffer.colorVec[0] = object->colorVec[0];
this->objectDataBuffer.colorVec[1] = object->colorVec[1];
this->objectDataBuffer.colorVec[2] = object->colorVec[2];
this->objectDataBuffer.alphaScale = *(object->alphaScale);

this->objectDataBuffer.normalMapEnable = object->normalMapEnable;

this->objectDataBuffer.specularMapEnable = object->specularMapEnable;

this->objectDataBuffer.lightMapEnable = object->lightMapEnable;
this->objectDataBuffer.lightVec[0] = object->lightVec[0];
this->objectDataBuffer.lightVec[1] = object->lightVec[1];
this->objectDataBuffer.lightVec[2] = object->lightVec[2];
this->objectDataBuffer.lightScale = *(object->lightScale);

this->objectDataBuffer.controlEnable = object->controlEnable;
this->objectDataBuffer.controlColorVec[0] = object->controlColorVec[0];
this->objectDataBuffer.controlColorVec[1] = object->controlColorVec[1];
this->objectDataBuffer.controlColorVec[2] = object->controlColorVec[2];

// Load the data into OpenGL buffers
// Shader Storage Data
glBindBuffer(GL_SHADER_STORAGE_BUFFER, this->objectSSBO);
glBufferSubData(GL_SHADER_STORAGE_BUFFER, sizeof(objectVarsData)*objectIndex, sizeof(objectVarsData), &(this->objectDataBuffer));

// Vertex Array Data
// NOTE(review): the source arrays are indexed with *vertexIndex, the running
// index into the shared VBO, rather than with the per-object loop counter i --
// this looks wrong for any object after the first in a substack; confirm.
glBindVertexArray(this->subSceneVAO);
glBindBuffer(GL_ARRAY_BUFFER, this->objectVBO);
for (int i = 0; i < (object->meshSize)*3; i++) {
    // Vertex Data
    glBufferSubData(GL_ARRAY_BUFFER, graphicsVertexDataSize_Byte * (*vertexIndex) + graphicsVertexPosOffset_Byte, graphicsVertexMeshDataSize_Byte, (const GLvoid *) &(object->meshPoints[(*vertexIndex) * graphicsVertexMeshDataSize]));

    // UV Vector
    glBufferSubData(GL_ARRAY_BUFFER, graphicsVertexDataSize_Byte * (*vertexIndex) + graphicsVertexUVOffset_Byte, graphicsVertexUVSize_Byte, (const GLvoid *) &(object->uvPoints[(*vertexIndex) * graphicsVertexUVSize]));

    // Object ID
    // Fix: upload objectIndex, the SSBO slot this object's record was written
    // to above. The original read through objectCount, which has already been
    // incremented, so every vertex referenced the slot after its own object.
    glBufferSubData(GL_ARRAY_BUFFER, graphicsVertexDataSize_Byte * (*vertexIndex) + graphicsVertexObjectIndexOffset_Byte, graphicsVertexObjectIndexSize_Byte, (const GLvoid *) &objectIndex);

    vertexIndex[0]++;
}


Each vertex is supposed to have:
-vertex data (vec3)
-tangent data (vec3)
-bitangent data (vec3)
-normal data (vec3)
-UV data (vec2)
-Object Index (uint) - the last one will be used to access the data of the object that the specific drawn triangle belongs to (object data is kept in SSBO)

I did some debugging, starting by disabling the SSBO and a whole bunch of shader things. It appears the culprit is my VBO. Instead of 2 test triangles being rendered, I get a single misshapen one. That disappears after the 2nd frame is drawn (then I get an empty screen).

At this point, I was wondering whether the uint in the vertex data (that's otherwise full of floats) can be the issue.

Also, this is kinda hard to debug. I was wondering whether it is a good practice (not just from a debugging point of view) to create an application buffer that keeps all the data before uploading the entire thing to OpenGL? So having a single glBufferData for the entire buffer, instead of multiple glBufferSubDatas. Will it make any difference?

john_connor
01-29-2018, 02:55 AM
I was wondering whether it is a good practice (not just from a debugging point of view) to create an application buffer that keeps all the data before uploading the entire thing to OpenGL? So having a single glBufferData for the entire buffer, instead of multiple glBufferSubDatas. Will it make any difference?

yes, it will. read about buffer streaming, check out the commands:
--> gl{Named}BufferData() with the same size AND usage hint
--> glMap{Named}BufferRange() and the possible mapping flags

https://www.khronos.org/opengl/wiki/Buffer_Object_Streaming
https://www.khronos.org/registry/OpenGL-Refpages/gl4/html/glBufferData.xhtml
https://www.khronos.org/registry/OpenGL-Refpages/gl4/html/glMapBufferRange.xhtml

CaptainSnugglebottom
01-30-2018, 07:48 AM
Thanks.

But does that mean that using my current way (huge VBO + lots of glBufferSubData commands) is incorrect and will never work?

PS.

If I use different Vertex Array structures between different rendering stages, do I need to call glEnableVertexAttribArray and specifically glDisableVertexAttribArray between each stage?

What would happen if, after stage 1 that uses 5 vertex attributes, I call stage 2 that uses only 2 (with a different shader of course)? How would OpenGL treat the 3 unused attributes?

mhagain
01-30-2018, 08:39 AM
But does that mean that using my current way (huge VBO + lots of glBufferSubData commands) is incorrect and will never work?

It should work but it's very unlikely to ever be efficient.

Graphics are like networking and hard disks - they can be very latency-sensitive, so lots of small updates will always compare unfavourably to very few large updates.

john_connor
01-30-2018, 12:53 PM
If I use different Vertex Array structures between different rendering stages, do I need to call glEnableVertexAttribArray and specifically glDisableVertexAttribArray between each stage?

That's what VAOs are about: storing the vertex attributes, their format / source buffer etc. If by "Vertex Array structures" you mean VAOs, then no: it's sufficient to set up the VAOs once and just bind them when needed.

CaptainSnugglebottom
01-30-2018, 04:37 PM
Thank you very much for your help. I will try changing my code to use 1 big VBO update.


Still wondering what I'm doing wrong tho. After I commented out most of my code, it just draws 1 single triangle that is not defined by any values I'm passing. Hope it doesn't come back to haunt my new implementation.

CaptainSnugglebottom
02-13-2018, 06:14 PM
Hello again.

I finally had a chance to replace my glBufferSubData with the glMapBuffer function, at least for the VBO, and that seems to be working well enough. However, the SSBO (that still uses glBufferSubData) is still having issues.

Test setup: 2 objects, different geometries, different position and rotation vectors.

Output:
- Different geometries (VBO finally working).
- SSBO for some reason only records the first object position vec3 value.
- The last object will always be rotated by some value. The rotation vector gets ignored except for 1 value (not sure which because of the constant rotation offset).
- I hardcoded the object index into the buffer to use either first or 2nd object's position, and as expected both objects only do stuff when using the 2nd object's positions. So I don't think VBO is the issue at all.
- I checked my C++ code, and the pointers in the glBufferSubData are proper. Each object's data is offset by 108 bytes, which corresponds to the struct (the rest of the SSBO related code is unchanged from above):


// Revised CPU-side object record: scaleVec is now three floats, giving a
// tightly packed size of 108 bytes (trailing comments: member size in bytes /
// running total).
// NOTE(review): the GLSL mirror uses vec3 members, and std430 aligns a vec3
// like a vec4 (OpenGL spec 7.6.2.2), so the shader-side offsets presumably
// differ from the ones listed here -- confirm.
struct objectVarsData {
float posVec[3]; // 12 12
float rotVec[3]; // 12 24
float scaleVec[3]; // 12 36

int textureLayer; // 4 40

int drawEnable; // 4 44
int colorMapEnable; // 4 48
float colorVec[3]; // 12 60
float alphaScale; // 4 64

int normalMapEnable; // 4 68

int specularMapEnable; // 4 72

int lightMapEnable; // 4 76
float lightVec[3]; // 12 88
float lightScale; // 4 92

int controlEnable; // 4 96
float controlColorVec[3]; // 12 108
};

and in the shader:


// Shader-side view of one object record in the objectVars storage buffer.
// NOTE(review): under std430 each vec3 below is aligned like a vec4 (OpenGL
// spec 7.6.2.2), so these members presumably do not sit at the offsets of a
// C++ struct that packs three floats per vector -- confirm.
struct objectVarsData {
vec3 posVec;
vec3 rotVec;
vec3 scaleVec;

int textureLayer;

int drawEnable;
int colorMapEnable;
vec3 colorVec;
float alphaScale;

int normalMapEnable;

int specularMapEnable;

int lightMapEnable;
vec3 lightVec;
float lightScale;

int controlEnable;
highp vec3 controlColorVec;
};

// Storage block at binding point 1 holding up to 100 object records.
layout(std430, binding = 1) buffer objectVars
{
objectVarsData objectVarsArray[100];
};

What could cause this bizarre behavior? Some sources state that I should be using vec4 instead of vec3, but they fail to mention cases where std430 is used, which should take care of things. Could this be the issue?

Most importantly, since glMapBuffer is so much easier to use, I was wondering whether I can use glMapBuffer at the same time for both the VBO and the SSBO. So it would look something like this:




glMapBuffer(GL_ARRAY_BUFFER, VBO)
glMapBuffer(GL_SHADER_STORAGE_BUFFER, SSBO)

for (each object) {
pushDataIntoSSBO()
pushGeometriesIntoVBO()
}

drawStuff()

glUnmapBuffer(GL_ARRAY_BUFFER)
glUnmapBuffer(GL_SHADER_STORAGE_BUFFER)



Or will this cause an issue?

GClements
02-13-2018, 11:00 PM
What could cause this bizarre behavior? Some sources state that I should be using vec4 instead of vec3, but they fail to mention cases where std430 is used, which should take care of things. Could this be the issue?

Yes. A vec3 is aligned as if it were a vec4 for both std140 and std430. The only difference between the two is that std140 rounds the alignment of all arrays and structures to that of a vec4, while std430 doesn't.

See 7.6.2.2 "Standard Uniform Block Layout" in the specification for the precise rules.

nimelord
02-14-2018, 02:18 AM
...Some sources state that I should be using vec4 instead of vec3...

I had the same problem some time ago.
I'd recommend you read carefully 7.6.2.2 of https://www.khronos.org/registry/OpenGL/specs/gl/glspec45.core.pdf .

Those complicated rules made me develop typed buffer, that can bind customized structures to GPU.

Dark Photon
02-14-2018, 05:03 AM
Most importantly, since glMapBuffer is so much easier to use, I was wondering whether I can use glMapBuffer at the same time, for both VBO and SSBO.

You should be able to, yes.

However, after you get your correctness bugs fixed and start looking at performance, I think you may find (as I have) that you get much better performance by avoiding use of glMapBuffer(). Unless you orphan the buffer before you call it, it is likely to cause internal synchronization in the GL driver (read: your CPU draw thread prevented from running while the GL driver and GPU "catches up" with the work you fed it thus far).

Better to look at using PERSISTENT+COHERENT mapped buffers, or UNSYNCHRONIZED buffer mapping in combination with buffer orphaning. For details, see Buffer Object Streaming (https://www.khronos.org/opengl/wiki/Buffer_Object_Streaming) in the wiki. At least familiarize yourself with how to orphan a buffer, and try it when you hit odd buffer update stalls.

CaptainSnugglebottom
02-14-2018, 03:05 PM
Thank you all for the answers. I have started repackaging my structs to avoid using vec3s.

The major issue was me using incorrect attribute buffering (buffered integer as a float for object indexing), fixing that coupled with redoing my structs fixed the issue and I can actually render things properly.

However, as I was adding stuff to the structs, I actually stumbled upon another issue that might have been preventing my program from working.

This is the struct that I used in C++ and its GLSL counterpart the last time the code worked:


// CPU-side object record uploaded to the SSBO (the last version that worked).
// Trailing comments: member size in bytes / running total (40 bytes overall).
struct graphicsObjectData {
    float posVec[3];  // 12 12  (fix: the terminating ';' was missing here)
    float rotVec[3];  // 12 24
    float sclVec[3];  // 12 36
    int textureLayer; // 4 40
};


struct objectData {
vec2 posXY; // 8 8
vec2 posZ_rotX; // 8 16
vec2 rotYZ; // 8 24
vec2 sclXY; // 8 32
float sclZ; // 4 36
int textureLayer; // 4 40
};


And the data was uploaded using glBufferSubData:


// Fill the staging record for one object and copy it into that object's slot
// of the SSBO (byte offset sizeof(graphicsObjectData) * objectIndex).
glBindBuffer(GL_SHADER_STORAGE_BUFFER, this->objectSSBO);

objectDataBuffer.posVec[0] = object->posVec[0];
objectDataBuffer.posVec[1] = object->posVec[1];
// Depth is derived from the running object count, not from the object's own z.
objectDataBuffer.posVec[2] = 1.0 - graphics2DDepthChange*(*objectCount); // change if the triangles are outside the camera limit
objectDataBuffer.rotVec[0] = object->rotVec[0];
objectDataBuffer.rotVec[1] = object->rotVec[1];
objectDataBuffer.rotVec[2] = object->rotVec[2];
objectDataBuffer.sclVec[0] = object->scaleVec[0];
objectDataBuffer.sclVec[1] = object->scaleVec[1];
objectDataBuffer.sclVec[2] = object->scaleVec[2];
objectDataBuffer.textureLayer = object->textureLayer;

// NOTE(review): the slot stride used here is the C++ sizeof; it only matches
// the shader if the GLSL array stride for the struct is the same -- confirm.
glBufferSubData(GL_SHADER_STORAGE_BUFFER, sizeof(graphicsObjectData)*objectIndex, sizeof(graphicsObjectData), &(objectDataBuffer));

... however the moment I add another integer to the struct, the problem of objects being messed up returns. The new structs from C++ and GLSL are:


struct graphicsObjectData {
float posVec[3]; // 12 12
float rotVec[3]; // 12 24
float sclVec[3]; // 12 36
int textureLayer; // 4 40
int drawEnable; // 4 44
};


// Shader-side record with the vec3s split into vec2/float members; trailing
// comments give member size in bytes / running total (44 bytes of members).
// NOTE(review): under std430 a struct takes the alignment of its most-aligned
// member (vec2, 8 bytes) and its size is presumably rounded up to a multiple
// of that, which would make array elements 48 bytes apart while the C++
// struct is 44 bytes -- confirm against OpenGL spec 7.6.2.2.
struct objectData {
vec2 posXY; // 8 8
vec2 posZ_rotX; // 8 16
vec2 rotYZ; // 8 24
vec2 sclXY; // 8 32
float sclZ; // 4 36
int textureLayer; // 4 40
int drawEnable; // 4 44
};

Since this is the only difference between the working state and not, I think adding 2 integers in a row causes some sort of offset in the struct layout, if not in GLSL then definitely in C++. I was wondering whether there's a solution for this.

GClements
02-14-2018, 04:36 PM
Thank you all for the answers. I have started repackaging my structs to avoid using vec3s.
Unless the amount of memory involved is significant, I'd suggest the opposite: add padding fields to the C++ structure so that you can just use vec3s in the GLSL structures. Or move the int/float fields to occupy what would otherwise be padding.


Since this is the only difference between the working state and not, I think adding 2 integers in a row causes some sort of offset in the struct layout, if not in GLSL then definitely in C++. I was wondering whether there's a solution for this.
What is sizeof(graphicsObjectData)? On a 64-bit system, it's conceivable that C++ is rounding the size to a multiple of 8 (implementations are free to add any amount of padding anywhere other than before the first member).

CaptainSnugglebottom
02-14-2018, 05:18 PM
Unless the amount of memory involved is significant, I'd suggest the opposite: add padding fields to the C++ structure so that you can just use vec3s in the GLSL structures. Or move the int/float fields to occupy what would otherwise be padding.

What is sizeof(graphicsObjectData)? On a 64-bit system, it's conceivable that C++ is rounding the size to a multiple of 8 (implementations are free to add any amount of padding anywhere other than before the first member).

I actually realized what the problem was, based on this Stackoverflow page (https://stackoverflow.com/questions/29531237/memory-allocation-with-std430-qualifier). Not the first time GLSL weird packaging rules caused issues. I think I'm starting to dislike it.

sizeof(graphicsObjectData) is actually what it's supposed to be, right now its 44 bytes, at least with Visual Studio. GCC might be doing it differently.

Right now I decided to separate the floats so I define each float component separately, but I was wondering whether vec3(posX, posY, posZ) actually takes away anything from the performance.

I will give padding a go tho just for practice. I am planning to draw up to 10 million objects and even so I might lose like a few mebibytes, which shouldn't be a problem for SSBO (which is why I decided to use it).

Edit: On the other hand, if I have 10+ million extra bytes to send to the GPU due to padding, it might slow things down quite a bit.

GClements
02-14-2018, 08:21 PM
I actually realized what the problem was, based on this Stackoverflow page (https://stackoverflow.com/questions/29531237/memory-allocation-with-std430-qualifier).
I'm not sure why I overlooked that in my previous post. If a structure contains vec2s, the structure itself needs to be aligned to a multiple of 8, which means that its size will be a multiple of 8.


Not the first time GLSL weird packaging rules caused issues. I think I'm starting to dislike it.
It really isn't that weird. GLSL vectors are first-class objects, so they need to be aligned to a multiple of their size (i.e. a vec2 needs 8-byte alignment). Note that SSE has the same requirement, so you should try to use 8-byte alignment on float arrays where practical (modern compilers will try to use SSE vectorisation where possible; if you want the code to work on non-SSE CPUs, you have to request that explicitly).



Right now I decided to separate the floats so I define each float component separately, but I was wondering whether vec3(posX, posY, posZ) actually takes away anything from the performance.

Depending upon the GPU architecture, it may improve it. On GPUs where everything is a vec4, putting all of the components in the same vec4 will be more efficient for operations which use all of the components. But I have no idea whether that's applicable to any GPU which supports SSBOs.


I am planning to draw up to 10 million objects.
In which case, packing may be worthwhile. But I'd suggest first trying reordering so that the scalar fields use the "spare" components.

CaptainSnugglebottom
02-16-2018, 09:40 AM
Hmm, well I finished the thing, and SSBO + VBO only provides a 2x performance increase over calling draw for each object separately. For 10000 sprites, with texturing and alpha channels, I get around 3 FPS (up from 1-1.5 FPS). Kind of a let down, lol.

I wonder how else I can change the pipeline to get a decent performance.

Would instancing be possible for different geometries within the same draw call? So that, I make objects with their own VBOs (VAO is the same for all objects) and then pass the VBO pointers, instead of going through each object and loading data into a single big VBO?

Alfonse Reinheart
02-16-2018, 10:32 AM
For 10000 sprites, with texturing and alpha channels, I get around 3 FPS (up from 1-1.5 FPS).

You can get better performance than that with immediate mode rendering. So clearly, you're doing something wrong. This would be performance I might expect if you rendered each sprite with a separate shader or something.

Odds are good you're hitting a software path somehow.

CaptainSnugglebottom
02-16-2018, 11:10 AM
I thought the performance remained low due to me re-writing the SSBO/VBO with the same object data that remains constant. If I find a way to avoid that, the performance would improve. Right now I am looking at multi-threading SSBO/VBO filling. I can have both SSBO and VBO mapped at the same time (to answer my earlier questions), so technically I can separate loading 10000 objects into loading 2500 objects in 4 threads. I think glMapBuffers will help preventing OpenGL throwing an out of context error.


Also I technically draw things twice, due to the nature of my renderer, and stage 4 requires its own FBO since OpenGL doesn't handle feedback.

Also I redraw things a couple of times after that, which I can remove completely if I use my brain and move things around.

But comparing to the 1:1 rendering, keeping all things the same, the performance should still be better. I am not sure where else the drop could be.

CaptainSnugglebottom
02-16-2018, 05:45 PM
Hmm, I think I know what you meant now. Guess I will try SubDating everything now. Can't make it worse, that's for sure.

2674

EDIT: The work load is now spread out a bit, but overall performance is just as poor. Weird. Also, apparently glBufferSubData is faster on the SSBO than VBO (VBO's takes 36%+).

Alfonse Reinheart
02-16-2018, 06:07 PM
Have you done any actual performance testing or profiling to determine where your bottleneck is? Because you seem to be assuming that uploading 10,000 sprites is your bottleneck, which seems decidedly unlikely. If you really can't transfer more than a few hundred kilobytes per second to your GPU, then your card has serious problems. It seems far more likely that you're screwing something else up.

CaptainSnugglebottom
02-16-2018, 08:19 PM
I did as much performance testing as Visual Studio allows me. It claims that most of my performance is gone due to nvoglv64.dll and gdi32.dll, which is everything OpenGL related. I would like to know what other ways I could do to find the issue.

Rendering is still a big part of it tho, if I disable texturing, it jumps from 2-3 FPS to 23-27 FPS for 10000 objects. I should also probably mention that I run it on a 2012 laptop.


If you really can't transfer more than a few hundred kilobytes per second to your GPU, then your card has serious problems

Actually, it's almost 4.7 megabytes: 108 bytes of object data for each of 10000 objects for the SSBO, plus 60 bytes of vertex data for each of 6 vertices for 10000 objects.

john_connor
02-17-2018, 06:39 AM
as far as i can tell you are using the mapped buffer pointer to update the buffer in a for-loop, which means the buffer is mapped the whole time. alternatively you can try to build a local "cpu-sided" buffer (std::vector, ::reserve(buffersize)), set the data in that buffer and then upload it somehow, either glBufferSubData() or glMapBufferRange() (or via buffer streaming if the previous data isnt relevant anymore)

Dark Photon
02-17-2018, 07:38 AM
Batch Rendering using SSBO

SSBO will contain an array of structs that contains all data for each object
struct objectVarsData {
float posVec[3];
float rotVec[3];
...


SSBO + VBO only provides a 2x performance increase over calling draw for each object separately. For 10000 sprites, with texturing and alpha channels, I get around 3 FPS (up from 1-1.5 FPS

...I thought the performance remained low due to me re-writing the SSBO/VBO with the same object data that remains constant.

...Also I technically draw things twice, due to the nature of my renderer, and stage 4 requires its own FBO since OpenGL doesn't handle feedback.

...Also I redraw things a couple of times after that, which I can remove completely if I use my brain and move things around.

This whole thread is wandering around in the weeds.

You took something that sounds really simple, you made it a lot more complex, and you still have very poor performance to show for it.

Rather than try to optimize your much-more-complex tech approach...

I'd suggest you ignore your tech approach for a second, pop back up to the top level, and tell us what you are trying to accomplish. What's the big picture? Are you just drawing a bunch of point sprites (quads) with texturing and alpha? Is it more complicated than that? If so, how? Then sketch out your original (non-SSBO/non-VBO/etc.) implementation for us (show some code snippets). Also, tell us what GPU/driver/OS you are targeting, the number of sprites you're aiming to render, and at what target frame time. You're more likely to get good performance in the end with this route.

I'd recommend that you first understand clearly why your original (non-SSBO/VBO/etc.) implementation is slow, and what you need to change (minimally) to remove its primary bottlenecks and net you good performance. Folks here can help you with that.


I did as much performance testing as Visual Studio allows me.
It claims that most of my performance is gone due to nvoglv64.dll and gdi32.dll, which is everything OpenGL related.

Ok, so you're GL driver (CPU) and/or GPU performance limited. Which means to get better performance, you need to change how you're using OpenGL to drive the GPU.

There are other ways to profile GPU-based apps than running the MSVS Profiler on them. For instance, having "feature toggles" in your app where you can switch on/off various pieces of your draw loop for debugging can be useful for isolating how much frame time each feature takes.


Would instancing be possible for different geometries within the same draw call?

Please explain what's different about the geometries. Do these sprites you're rendering have different numbers of vertices (e.g. != 4)?

CaptainSnugglebottom
02-17-2018, 12:08 PM
I'd suggest you ignore your tech approach for a second, pop back up to the top level, and tell us what you are trying to accomplish. What's the big picture? Are you just drawing a bunch of point sprites (quads) with texturing and alpha? Is it more complicated than that? If so, how? Then sketch out your original (non-SSBO/non-VBO/etc.) implementation for us (show some code snippets). Also, tell us what GPU/driver/OS you are targeting, the number of sprites you're aiming to render, and at what target frame time. You're more likely to get good performance in the end with this route.

For educational purposes, I am building a multi-purpose engine. The idea is to be able to support geometry (triangle based), lines, and points. Right now I am doing the 2D rendering pipeline, where I assume all triangle based shapes to be flat, and to be ordered in a way (since there's usually some sort of hierarchy to 2D graphics, most important objects on top). In both 2D and 3D pipelines, I will be using (already have, but it's disabled) the deferred shading technique as a way to optimize shading operations. Since deferred shading inherently does not work well with the transparent objects, I had to separate operations into 4 stages:

Stage 1: Render solids (alpha == 1) in FBO1
Stage 2: Do deferred shading, save to FBO2
Stage 3: Render alphas, using pre-rendered depth buffer from stage 1 to discard all fragments covered by non-transparent objects. Each alpha fragment is rendered with shading applied.
Stage 4: Render the layer's output to SceneFBO

This is done for each layer, with the results from each layer rendered on top of each other in stage 4. Also, for each object I render control geometry to its own output, where each object has its own unique application-wide control value. After all layers are rendered, the mouse position is extracted to trigger a flag in whatever object the mouse is pointing at.

I can't really show you the code snippets because I wouldn't know where to start. It's just about 5000 lines of object oriented code right now.

Most of the things I mentioned, I implemented by calling draw calls for each object. So right now I am trying to learn something new while fixing the performance issue. I am aiming at using Windows OS, but I try to use cross-platform libraries in case I need to use my thing on a Linux machine.


Ok, so you're GL driver (CPU) and/or GPU performance limited. Which means to get better performance, you need to change how you're using OpenGL to drive the GPU.

There are other ways to profile GPU-based apps than running the MSVS Profiler on them. For instance, having "feature toggles" in your app where you can switch on/off various pieces of your draw loop for debugging can be useful for isolating how much frame time each feature takes.

I absolutely understand that drawing 10000+ objects by constantly uploading object data into buffers is insanity. It has bad design written all over it, which is why I will be implementing batch rendering next.


Please explain what's different about the geometries. Do these sprites you're rendering have different numbers of vertices (e.g. != 4)?


Yes exactly.

Right now I am thinking about making prototypes of each object type that contain its own geometry data and VBO location for batch rendering. That way when I make an object, I can use instanced rendering for drawing data from the predefined VBO. That should remove any need to update the vertex data at all, which is 77% of the data I upload to the video card every frame right now.

john_connor
02-17-2018, 02:40 PM
i suggest you start reading about rendering techniques ("OpenGL Superbible", "OpenGL Programming Guide" and other books/articles by nvidia and so on)

Alfonse Reinheart
02-17-2018, 04:32 PM
I am building a multi-purpose engine.

It should be noted that "performance" and "multi-purpose" don't go together. Imposing limitations on your scene is what allows you to be able to make optimizations. The more options you give to the user, the fewer options you leave for optimization.


In both 2D and 3D pipelines, I will be using (already have, but it's disabled) the deferred shading technique as a way to optimize shading operations.

... why would you need to use deferred shading for 2D rendering? I could understand needing deferred shading if you're rendering billboards or something, but most 2D sprite rendering doesn't even use lighting.


Right now I am thinking about making prototypes of each object type that contain its own geometry data and VBO location for batch rendering. That way when I make an object, I can use instanced rendering for drawing data from the predefined VBO. That should remove any need to update the vertex data at all, which is 77% of the data I upload to the video card every frame right now.

Until you have positively identified the bottleneck, you should not be making those kinds of decisions. After all, what good does it do to reduce your data uploads by 77% if data uploading is not what's causing your performance problem.

The best way to figure this out is to reduce everything down to just the OpenGL stuff. Rip out your entire engine (or just open up a new OpenGL project), and rebuild just the sequence of operations needed to produce the output. The best way to do that is to get an OpenGL trace tool, have it spit out a log of OpenGL commands, and then put those commands in your new application.

From there, start profiling. Use timer queries to figure out how long operations on the GPU are taking. Pull things out and see if it improves performance. Start figuring out what is causing your problem.

Only when you know what the problem is can you actually solve it.

CaptainSnugglebottom
02-17-2018, 07:35 PM
It should be noted that "performance" and "multi-purpose" don't go together. Imposing limitations on your scene is what allows you to be able to make optimizations. The more options you give to the user, the fewer options you leave for optimization.

I know that, but I was hoping to get something more than 3 FPS for something basic like 10k sprites. The question is how to achieve that, and that's why I wanted to try batch rendering.


... why would you need to use deferred shading for 2D rendering? I could understand needing deferred shading if you're rendering billboards or something, but most 2D sprite rendering doesn't even use lighting.

I must be nuts, but I want to try something not a lot of people do.


Until you have positively identified the bottleneck, you should not be making those kinds of decisions. After all, what good does it do to reduce your data uploads by 77% if data uploading is not what's causing your performance problem.

I played around with the "comment out" tool, and it appears that most of my performance loss is in the fragment shaders that use too many if statements. Rendering a plain color, with alpha pass disabled gives me 15+ FPS for 10000 objects. For 1000 objects, the render goes from 30 FPS to 130 if I disable everything. So the main reason for my slowdown is the uber shader, but due to the nature of what I want to do, I guess I cannot change that.

Still tho, it does not mean I should not be looking into other things. Drawing just 2000 triangles at 130 FPS (with all effects off) is still not enough.

Alfonse Reinheart
02-17-2018, 07:37 PM
Oh and stop reporting performance as "FPS". Performance is best measured in actual frame time.

CaptainSnugglebottom
02-18-2018, 10:09 PM
Hello again,


I have stumbled upon a Steam Game Dev conference (https://www.youtube.com/watch?v=-bCeNzgiJ8I) that featured a presentation on modern techniques for vertex data streaming, in particular a method that utilizes persistent buffers. I implemented the solution the presenter proposed, replacing my SSBO and VBO buffers with persistent ones.


Initialization:

// Object SSBO
// Creates the buffer, allocates storage for three regions of
// graphics2DMaximumSSBOSize_Byte each, and maps the whole range once; the
// returned pointer is kept in objectSSBOAddrStart.
glGenBuffers(1, &(this->objectSSBO));
glBindBuffer(GL_SHADER_STORAGE_BUFFER, this->objectSSBO);

// The same write/persistent/coherent flags are passed to both the storage
// allocation and the mapping call.
glBufferStorage(GL_SHADER_STORAGE_BUFFER, graphics2DMaximumSSBOSize_Byte * 3, NULL, GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | GL_MAP_COHERENT_BIT);

// NOTE(review): the result is not checked for NULL before being stored.
this->objectSSBOAddrStart = (graphics2DObjectData *) glMapBufferRange(GL_SHADER_STORAGE_BUFFER, 0, graphics2DMaximumSSBOSize_Byte * 3, GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | GL_MAP_COHERENT_BIT);

glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0);

// Object VBO
// Same scheme for the vertex buffer: three regions of
// graphics2DMaximumVBOSize_Byte, mapped once into objectVBOAddrStart.
glGenBuffers(1, &(this->objectVBO));
glBindBuffer(GL_ARRAY_BUFFER, this->objectVBO);

// The attribute layout is recorded in the VAO while objectVBO is the bound
// GL_ARRAY_BUFFER; stride and offsets come from graphics2DObjectVertexData.
glGenVertexArrays(1, &(this->objectVAO));
glBindVertexArray(this->objectVAO);
// Attribute 0: position, 3 floats.
glVertexAttribPointer(0, 3, GL_FLOAT, GL_FALSE, sizeof(graphics2DObjectVertexData), (GLvoid*)offsetof(graphics2DObjectVertexData, position));
glEnableVertexAttribArray(0);
// Attribute 1: UV coordinates, 2 floats.
glVertexAttribPointer(1, 2, GL_FLOAT, GL_FALSE, sizeof(graphics2DObjectVertexData), (GLvoid*)offsetof(graphics2DObjectVertexData, uvCoordinates));
glEnableVertexAttribArray(1);
// Attribute 2: object index, one unsigned int, set up with the integer
// (IPointer) variant rather than the float one.
glVertexAttribIPointer(2, 1, GL_UNSIGNED_INT, sizeof(graphics2DObjectVertexData), (GLvoid*)offsetof(graphics2DObjectVertexData, objectIndex));
glEnableVertexAttribArray(2);

glBufferStorage(GL_ARRAY_BUFFER, graphics2DMaximumVBOSize_Byte*3, NULL, GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | GL_MAP_COHERENT_BIT);

// NOTE(review): the result is not checked for NULL before being stored.
this->objectVBOAddrStart = (graphics2DObjectVertexData *) glMapBufferRange(GL_ARRAY_BUFFER, 0, graphics2DMaximumVBOSize_Byte * 3, GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | GL_MAP_COHERENT_BIT);

// NOTE(review): the VAO is still bound at this point; only the array buffer
// binding is reset.
glBindBuffer(GL_ARRAY_BUFFER, 0);

Synchronization:


// Waiting for buffer
// If a fence from an earlier pass exists, poll it until it reports signaled.
// NOTE(review): the loop only exits on GL_ALREADY_SIGNALED or
// GL_CONDITION_SATISFIED; if glClientWaitSync returns GL_WAIT_FAILED it would
// spin forever - confirm whether that case needs handling.
GLenum waitStatus = GL_UNSIGNALED;
if (this->subSceneSync) {
while ((waitStatus != GL_ALREADY_SIGNALED) && (waitStatus != GL_CONDITION_SATISFIED))
{
waitStatus = glClientWaitSync(this->subSceneSync, GL_SYNC_FLUSH_COMMANDS_BIT, 1);
}
}

// Select the write position for the current region (currentBuffer cycles 0..2).
// NOTE(review): the offsets added here are byte counts, but the start pointers
// were produced by casts to struct pointer types in the initialization code.
// If the members are declared with those struct types, pointer arithmetic
// scales the offset by sizeof(element) and may land outside the mapped range
// for currentBuffer > 0 - confirm the declared pointer types.
this->objectVBOAddr = this->objectVBOAddrStart + this->currentBuffer*graphics2DMaximumVBOSize_Byte;
this->objectSSBOAddr = this->objectSSBOAddrStart + this->currentBuffer*graphics2DMaximumSSBOSize_Byte;

/////////////////////////////////////
// FETCH AND RENDER HERE
/////////////////////////////////////

// Advance to the next of the three regions.
this->currentBuffer = (this->currentBuffer + 1) % 3;

// Locking the buffer
// Only one fence is kept: the previous one is deleted and a new one is
// inserted after the commands issued above.
if (this->subSceneSync) glDeleteSync(this->subSceneSync);

this->subSceneSync = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);


Rendering:


// Stage-1 pass: clear subSceneFBO1, bind the four texture maps to units 0-3,
// set depth/blend state, bind the object SSBO and VAO/VBO, and issue one draw.
glBindFramebuffer(GL_FRAMEBUFFER, this->subSceneFBO1);
glClearColor(0.0f, 0.0f, 0.0f, 0.0f);
glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);

glUseProgram(graphics2DStage1ObjectShader);

// Each map is bound to its own texture unit, and the matching sampler uniform
// is set to that unit's index.
glActiveTexture(GL_TEXTURE0);
glBindTexture(GL_TEXTURE_2D, this->textureAsset->colorMapID);
glUniform1i(graphics2DStage1ObjectColorMapLocation, 0);

glActiveTexture(GL_TEXTURE1);
glBindTexture(GL_TEXTURE_2D, this->textureAsset->normalMapID);
glUniform1i(graphics2DStage1ObjectNormalMapLocation, 1);

glActiveTexture(GL_TEXTURE2);
glBindTexture(GL_TEXTURE_2D, this->textureAsset->specularMapID);
glUniform1i(graphics2DStage1ObjectSpecularMapLocation, 2);

glActiveTexture(GL_TEXTURE3);
glBindTexture(GL_TEXTURE_2D, this->textureAsset->lightMapID);
glUniform1i(graphics2DStage1ObjectLightMapLocation, 3);

// Depth test and depth writes on, blending off for this pass.
glEnable(GL_DEPTH_TEST);
glDepthMask(GL_TRUE);
glDisable(GL_BLEND);

// Binding SSBO
// The whole buffer is attached to shader storage binding point 1.
glBindBuffer(GL_SHADER_STORAGE_BUFFER, this->objectSSBO);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, this->objectSSBO);

// Binding VBO
// The first vertex is offset by currentBuffer regions of
// graphics2DMaximumVerteces vertices; vertexIndex is passed as the vertex count.
glBindVertexArray(this->objectVAO);
glBindBuffer(GL_ARRAY_BUFFER, this->objectVBO);
glDrawArrays(GL_TRIANGLES, graphics2DMaximumVerteces*this->currentBuffer, vertexIndex);



These are the only major changes from the last working version of my thing.

However, nvoglv64.dll crashes during SSBO data filling for the very first object. Addresses seem to be good, all the buffer switching is proper as well. My video card does support the ARB_BUFFER_STORAGE extension. Is there anything else I can check to ensure the working order?

The program also crashes with just VBO being persistent, but it actually makes it past frame 1, so I don't think having 2 persistent buffers is an issue.

Suggestions are appreciated.