PDA

View Full Version : NVIDIA VBO performance issue



macarter
07-20-2007, 09:54 AM
Our application performs well on both ATI and NVIDIA GPUs until we use NVIDIA 90 series or later drivers. With the newer drivers the performance of our application is severely degraded.

For example consider a drawing loop using the following pattern:


{
glBindBuffer
glMapBuffer
glUnmapBuffer

for (i = 0; i < N; i++) glDrawRangeElements;
}

When N < 3 with a usage hint of GL_STATIC_DRAW or GL_DYNAMIC_DRAW, or N < 5 with GL_STREAM_DRAW, the vertex buffer object stays fixed in GPU memory; otherwise it is repeatedly shuffled between GPU and system memory :confused: . The 84.21 driver did not shuffle the VBO with any number of draw calls.

Since this problem has persisted for many driver releases, I have come to the conclusion that either very few applications make use of glMapBuffer in combination with drawing multiple batches from a single VBO, or developers have failed to notice the loss of performance.

Obviously our application is not structured as simply as the pattern described above. Use of glBufferSubData() is not appropriate for our application since the VBO is filled from another thread, lacking an OpenGL context, using the pointer returned by glMapBuffer. This is normally a very effective method of allowing a second CPU core to participate in a drawing application without slowing the OpenGL drawing thread. This was anticipated and specifically allowed by the VBO extension.

Since our VBO is never filled from the drawing thread, I contend that GL_STATIC_DRAW is the most appropriate hint for optimal drawing performance. The performance of the thread filling the VBO is of much less concern. Further, the driver should not let the use of glMapBuffer influence the optimization of the VBO memory placement, since it can't know whether the pointer is used in a drawing thread.

A bug reproduction test program follows. It performs well with the 84.21 driver and poorly with 90 series or later drivers.


//gcc -Wall -mno-cygwin -O2 -o vboN -I../include vboN.c -lglut32 -lopengl32

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <windows.h>
#include <GL/gl.h>
#include <GL/glext.h>
#include <GL/glut.h>

/* Fallback declarations for the ARB vertex buffer object extension. They are
 * compiled only when the GL headers included above have not already defined
 * GL_ARB_vertex_buffer_object. */
#ifndef GL_ARB_vertex_buffer_object
#define GL_ARB_vertex_buffer_object

/* Integer types used for buffer offsets and buffer sizes. */
typedef INT_PTR GLintptrARB;
typedef INT_PTR GLsizeiptrARB;

/* Buffer binding targets, usage hints and the map access token used below. */
#define GL_ARRAY_BUFFER_ARB 0x8892
#define GL_ELEMENT_ARRAY_BUFFER_ARB 0x8893
#define GL_STREAM_DRAW_ARB 0x88E0
#define GL_STATIC_DRAW_ARB 0x88E4
#define GL_DYNAMIC_DRAW_ARB 0x88E8
#define GL_WRITE_ONLY_ARB 0x88B9

/* Function pointer types for the extension entry points that scene_init()
 * resolves at run time. */
typedef GLvoid (APIENTRY * PFNGLBINDBUFFERARBPROC) (GLenum target, GLuint buffer);
typedef GLvoid (APIENTRY * PFNGLGENBUFFERSARBPROC) (GLsizei n, GLuint *buffers);
typedef GLvoid (APIENTRY * PFNGLBUFFERDATAARBPROC) (GLenum target, GLsizeiptrARB size, const GLvoid *data, GLenum usage);
typedef GLvoid (APIENTRY * PFNGLBUFFERSUBDATAARBPROC) (GLenum target, GLintptrARB offset, GLsizeiptrARB size, const GLvoid *data);
typedef GLvoid * (APIENTRY * PFNGLMAPBUFFERARBPROC) (GLenum target, GLenum access);
typedef GLboolean (APIENTRY * PFNGLUNMAPBUFFERARBPROC) (GLenum target);

#endif /* GL_ARB_vertex_buffer_object */

/* Larger of a and b. The selected argument is evaluated twice, so do not pass
 * expressions with side effects. Not referenced by the code shown in this
 * file. */
#define MAX(a, b) (((a)<(b))?(b):(a))
/* Number of elements in x; x must be a real array, not a pointer. */
#define DIM(x) (sizeof(x)/sizeof(*(x)))

/* Element type of the index buffer and the matching GL type token passed to
 * glDrawRangeElements; the two must be changed together. */
#define GLindex GLushort // or GLuint
#define GL_INDEX GL_UNSIGNED_SHORT // or GL_UNSIGNED_INT

/* Capacity of the vertex and index buffers, in elements and in bytes. */
#define POLY_VERTEX_MAX 400000
#define VBO_SIZE (POLY_VERTEX_MAX * sizeof(PolyVertexFormat))
#define INDEX_MAX (4 * POLY_VERTEX_MAX)
#define IVBO_SIZE (INDEX_MAX * sizeof(GLindex))

/* One interleaved vertex record. setup_pointers() hands the offset of each
 * member to the matching gl*Pointer call, using sizeof(PolyVertexFormat) as
 * the stride. */
typedef struct
{
GLubyte color[4];   // four unsigned-byte colour components
GLfloat normal[3];  // normal vector
GLfloat texture[2]; // texture coordinate
GLfloat vertex[3];  // position
} PolyVertexFormat;

/* Vertex and index storage used for drawing. */
typedef struct
{
PolyVertexFormat *pvert; // pointer to the vertex buffer
GLindex *pindx; // pointer to the index buffer
GLuint name[2];// buffer names when using vertex buffer objects
} ElementBuffer;

/* The single buffer set of this program: filled in by scene_init() and read
 * by drawing_loop(). name[0] is bound as the array buffer and name[1] as the
 * element array buffer. */
static ElementBuffer ebuf;

/* OpenGL extension entry points. They are left unset until scene_init()
 * resolves them with wglGetProcAddress(), so they must not be called before
 * then. glBufferSubData is resolved but not called in this file. */

PFNGLDRAWRANGEELEMENTSEXTPROC glDrawRangeElements;
PFNGLBINDBUFFERARBPROC glBindBuffer;
PFNGLGENBUFFERSARBPROC glGenBuffers;
PFNGLBUFFERDATAARBPROC glBufferData;
PFNGLBUFFERSUBDATAARBPROC glBufferSubData;
PFNGLMAPBUFFERARBPROC glMapBuffer;
PFNGLUNMAPBUFFERARBPROC glUnmapBuffer;

/* Two triangles of three vertices each. Every vertex has colour
 * (0,255,0,255), normal (0,0,-1), texture coordinate (0,0) and z = 2.
 * scene_init() copies only green_triangle[0] into the vertex buffer. */
static PolyVertexFormat green_triangle[2][3] = {
{{{0,255,0,255}, {0,0,-1}, {0,0}, {-1,1,2}},
{{0,255,0,255}, {0,0,-1}, {0,0}, {1,1,2}},
{{0,255,0,255}, {0,0,-1}, {0,0}, {0,-1,2}}},
{{{0,255,0,255}, {0,0,-1}, {0,0}, {-1,-1,2}},
{{0,255,0,255}, {0,0,-1}, {0,0}, {0,1,2}},
{{0,255,0,255}, {0,0,-1}, {0,0}, {1,-1,2}}}};
/* Indices of the three vertices of one triangle. */
static GLindex tri_index[3] = {0, 1, 2};

/*
 * Print a human-readable description of an OpenGL error code to stdout.
 *
 * code: value returned by glGetError(); GL_NO_ERROR prints nothing.
 * str:  optional prefix written before the description, typically the name
 *       of the calling function; may be NULL.
 */
static void glErrorLog(GLenum code, const char *str)
{
    static const struct
    {
        GLenum code;
        const char *text;
    } glErrorTranslate[] = {
        {GL_INVALID_ENUM, "enum argument out of range"},
        {GL_INVALID_VALUE, "Numeric argument out of range"},
        {GL_INVALID_OPERATION, "Operation illegal in current state"},
        {GL_STACK_OVERFLOW, "Command would cause a stack overflow"},
        {GL_STACK_UNDERFLOW, "Command would cause a stack underflow"},
        {GL_OUT_OF_MEMORY, "Not enough memory left to execute command"},
        {GL_TABLE_TOO_LARGE, "The specified table is too large"}};
    const size_t count = sizeof(glErrorTranslate) / sizeof(glErrorTranslate[0]);
    const char *prefix = str ? str : "";
    size_t i;

    if (code == GL_NO_ERROR)
        return;

    for (i = 0; i < count; i++)
    {
        if (glErrorTranslate[i].code == code)
        {
            printf("%s%s\n", prefix, glErrorTranslate[i].text);
            return;
        }
    }

    /* Keep the caller's prefix and show the raw value so an unrecognised
     * code can still be looked up. */
    printf("%sUnknown gl error code 0x%04X\n", prefix, (unsigned)code);
}

/*
 * Report whether extName is listed in the GL_EXTENSIONS string.
 *
 * A match counts only when it is a whole, space-delimited token, so a name
 * that is merely a prefix or suffix of another extension is rejected.
 *
 * extName: extension name to look for.
 * Returns GL_TRUE when found; GL_FALSE when not found, when extName is NULL
 * or empty, or when glGetString() returns NULL.
 */
static GLboolean QueryExtension(const char *extName) // extName in extensions string?
{
    const char *extensions = (const char *)glGetString(GL_EXTENSIONS);
    const char *scan, *hit;
    size_t len;

    if (!extName || !*extName || !extensions)
        return GL_FALSE;

    len = strlen(extName);

    for (scan = extensions; (hit = strstr(scan, extName)) != NULL; scan = hit + 1)
    {
        if ((hit == extensions || hit[-1] == ' ') &&
            (hit[len] == ' ' || hit[len] == '\0'))
            return GL_TRUE; // properly delimited
    }

    return GL_FALSE;
}

static void scene_init(int IndexVBO)
{
typedef BOOL (WINAPI * PFNWGLSWAPINTERVALEXTPROC) (int interval);
PFNWGLSWAPINTERVALEXTPROC wglSwapInterval;

wglSwapInterval = (void*)wglGetProcAddress("wglSwapIntervalEXT");
wglSwapInterval(0); // Vsync mode off

if (QueryExtension("GL_ARB_vertex_buffer_object"))
{
glBindBuffer = (void*)wglGetProcAddress("glBindBufferARB");
glGenBuffers = (void*)wglGetProcAddress("glGenBuffersARB");
glBufferData = (void*)wglGetProcAddress("glBufferDataARB");
glBufferSubData = (void*)wglGetProcAddress("glBufferSubDataARB");
glMapBuffer = (void*)wglGetProcAddress("glMapBufferARB");
glUnmapBuffer = (void*)wglGetProcAddress("glUnmapBufferARB");
} else printf("Vertex buffer objects not supported\n");

glDrawRangeElements = (void*)wglGetProcAddress("glDrawRangeElements");

if (!glDrawRangeElements) printf("glDrawRangeElements not supported\n");

glGenBuffers(2, ebuf.name);
glBindBuffer(GL_ARRAY_BUFFER_ARB, ebuf.name[0]);
glBufferData(GL_ARRAY_BUFFER_ARB, VBO_SIZE, NULL, GL_STREAM_DRAW_ARB);
ebuf.pvert = glMapBuffer(GL_ARRAY_BUFFER_ARB, GL_WRITE_ONLY_ARB);
memcpy(ebuf.pvert, green_triangle[0], sizeof(green_triangle[0]));
glUnmapBuffer(GL_ARRAY_BUFFER_ARB);

if (IndexVBO)
{
glBindBuffer(GL_ELEMENT_ARRAY_BUFFER_ARB, ebuf.name[1]);
glBufferData(GL_ELEMENT_ARRAY_BUFFER_ARB, IVBO_SIZE, NULL,
GL_STREAM_DRAW_ARB);
ebuf.pindx = glMapBuffer(GL_ELEMENT_ARRAY_BUFFER_ARB,
GL_WRITE_ONLY_ARB);
memcpy(ebuf.pindx, tri_index, sizeof(tri_index));
glUnmapBuffer(GL_ELEMENT_ARRAY_BUFFER_ARB);
}
else
{
ebuf.pindx = malloc(IVBO_SIZE);
memcpy(ebuf.pindx, tri_index, sizeof(tri_index));
}

glMatrixMode(GL_PROJECTION);
glFrustum(1, -1, 1, -1, 1, 1000);
glScalef(-1,-1,-1);
glMatrixMode(GL_MODELVIEW);
glLoadIdentity();
glEnableClientState(GL_COLOR_ARRAY);
glEnableClientState(GL_VERTEX_ARRAY);
glEnableClientState(GL_TEXTURE_COORD_ARRAY);
glEnableClientState(GL_NORMAL_ARRAY);
glClearColor(1,0,0,0);
}

static void setup_pointers(void *base)
{
glTexCoordPointer(DIM(((PolyVertexFormat*)0)->texture),
GL_FLOAT, sizeof(PolyVertexFormat),
base + (int)((PolyVertexFormat*)0)->texture);
glNormalPointer(GL_FLOAT, sizeof(PolyVertexFormat),
base + (int)((PolyVertexFormat*)0)->normal);
glColorPointer(DIM(((PolyVertexFormat*)0)->color),
GL_UNSIGNED_BYTE, sizeof(PolyVertexFormat),
base + (int)((PolyVertexFormat*)0)->color);
glVertexPointer(DIM(((PolyVertexFormat*)0)->vertex), GL_FLOAT,
sizeof(PolyVertexFormat),
base + (int)((PolyVertexFormat*)0)->vertex);
}

/*
 * GLUT display callback that runs the whole benchmark and then exits.
 *
 * After one-time initialisation it renders FRAME_COUNT frames. For each
 * frame it maps and immediately unmaps the buffer object(s), then times
 * DRAWS_PER_FRAME glDrawRangeElements calls up to glFinish() and prints the
 * elapsed time in milliseconds. One pending GL error, if any, is reported
 * before the process exits.
 */
static void drawing_loop(void)
{
    enum { FRAME_COUNT = 50, DRAWS_PER_FRAME = 5 };
    LARGE_INTEGER t0, t1, freq;
    int i, IndexVBO = 0;

    scene_init(IndexVBO); // one time initialization
    QueryPerformanceFrequency(&freq); // counts per second

    for (i = 0; i < FRAME_COUNT; i++)
    {
        int j;

        /* The map/unmap pair writes nothing; it is only there to trigger
         * the driver behaviour under test. */
        glBindBuffer(GL_ARRAY_BUFFER_ARB, ebuf.name[0]);
        glMapBuffer(GL_ARRAY_BUFFER_ARB, GL_WRITE_ONLY_ARB);
        glUnmapBuffer(GL_ARRAY_BUFFER_ARB);

        if (IndexVBO)
        {
            glBindBuffer(GL_ELEMENT_ARRAY_BUFFER_ARB, ebuf.name[1]);
            glMapBuffer(GL_ELEMENT_ARRAY_BUFFER_ARB, GL_WRITE_ONLY_ARB);
            glUnmapBuffer(GL_ELEMENT_ARRAY_BUFFER_ARB);
        }

        QueryPerformanceCounter(&t0);
        glClear(GL_COLOR_BUFFER_BIT);
        setup_pointers(0);

        // multiple draw calls stimulate the VBO problem
        // three for GL_STATIC_DRAW_ARB and GL_DYNAMIC_DRAW_ARB
        // and five for GL_STREAM_DRAW_ARB

        for (j = 0; j < DRAWS_PER_FRAME; j++)
            glDrawRangeElements(GL_TRIANGLES, 0, 2, 3, GL_INDEX,
                                IndexVBO ? NULL : ebuf.pindx);

        glFinish(); /* wait for the GPU so the timing covers the drawing */
        QueryPerformanceCounter(&t1);
        glutSwapBuffers();
        printf("glDrawRangeElements %f milliseconds\n",
               1000.0 * (double)(t1.QuadPart - t0.QuadPart) / (double)freq.QuadPart);
        fflush(stdout);
    }

    glErrorLog(glGetError(), "drawing_loop() "); // any errors?

    exit(0);
}

int main(int argc, char *argv[])
{
int window;

glutInit(&amp;argc, argv);
glutInitDisplayMode(GLUT_RGBA | GLUT_DOUBLE | GLUT_ALPHA);
glutInitWindowSize(100,100);
glutInitWindowPosition(0,0);
window = glutCreateWindow("");
glutDisplayFunc(&amp;drawing_loop);
glutMainLoop();

return 0;
}