NVIDIA VBO performance issue

Our application performs well on both ATI and NVIDIA GPUs until we use NVIDIA 90-series or later drivers. With the newer drivers, the performance of our application is severely degraded.

For example consider a drawing loop using the following pattern:

{
    glBindBuffer
    glMapBuffer
    glUnmapBuffer

    for (i = 0; i < N; i++) glDrawRangeElements;
} 

When N < 3 with hints of GL_STATIC_DRAW and GL_DYNAMIC_DRAW, or N < 5 with GL_STREAM_DRAW, the vertex buffer object stays fixed in GPU memory, otherwise it is repeatedly shuffled between GPU and system memory :confused: . The 84.21 driver did not shuffle the VBO with any number of draw calls.

Since this problem has persisted for many driver releases I have come to the conclusion that very few applications make use of glMapBuffer in combination with drawing multiple batches from a single VBO, or developers have failed to note the loss of performance.

Obviously our application is not structured as simply as the pattern described above. Use of glBufferSubData() is not appropriate for our application, since the VBO is filled from another thread, lacking an OpenGL context, using the pointer returned by glMapBuffer. This is normally a very effective method of allowing a second CPU core to participate in a drawing application without slowing the OpenGL drawing thread. This was anticipated and specifically allowed by the VBO extension.

Since our VBO is never filled from the drawing thread, I contend that GL_STATIC_DRAW is the most appropriate hint for optimal drawing performance. The performance of the thread filling the VBO is of much less concern. Further, the driver should not let the use of glMapBuffer influence the optimization of the VBO memory placement, since it can’t know if the pointer is used in a drawing thread.

A bug reproduction test program follows. It performs well with the 84.21 driver and poorly with 90 series or later drivers.

//gcc -Wall -mno-cygwin -O2 -o vboN -I../include vboN.c  -lglut32 -lopengl32

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <windows.h>
#include <GL/gl.h>
#include <GL/glext.h>
#include <GL/glut.h>

/*
 * Fallback declarations for the ARB vertex buffer object extension.
 * They are compiled only when the GL headers included above have not
 * already defined GL_ARB_vertex_buffer_object.
 */
#ifndef GL_ARB_vertex_buffer_object
#define GL_ARB_vertex_buffer_object  

/* Pointer-sized integer types used for buffer offsets and sizes. */
typedef INT_PTR GLintptrARB;
typedef INT_PTR GLsizeiptrARB;

/* Buffer binding targets, usage hints and map access token. */
#define GL_ARRAY_BUFFER_ARB                             0x8892
#define GL_ELEMENT_ARRAY_BUFFER_ARB                     0x8893
#define GL_STREAM_DRAW_ARB                              0x88E0
#define GL_STATIC_DRAW_ARB                              0x88E4
#define GL_DYNAMIC_DRAW_ARB                             0x88E8
#define GL_WRITE_ONLY_ARB                               0x88B9

/* Function pointer types for the entry points fetched with wglGetProcAddress(). */
typedef GLvoid      (APIENTRY * PFNGLBINDBUFFERARBPROC) (GLenum target, GLuint buffer);
typedef GLvoid      (APIENTRY * PFNGLGENBUFFERSARBPROC) (GLsizei n, GLuint *buffers);
typedef GLvoid      (APIENTRY * PFNGLBUFFERDATAARBPROC) (GLenum target, GLsizeiptrARB size, const GLvoid *data, GLenum usage);
typedef GLvoid      (APIENTRY * PFNGLBUFFERSUBDATAARBPROC) (GLenum target, GLintptrARB offset, GLsizeiptrARB size, const GLvoid *data);
typedef GLvoid *    (APIENTRY * PFNGLMAPBUFFERARBPROC) (GLenum target, GLenum access);
typedef GLboolean   (APIENTRY * PFNGLUNMAPBUFFERARBPROC) (GLenum target);

#endif /* GL_ARB_vertex_buffer_object*/

/* Larger of two values. Each argument may be evaluated twice. */
#define MAX(a, b)		(((a)<(b))?(b):(a))
/* Element count of a true array; not valid for a pointer. */
#define DIM(x)			(sizeof(x)/sizeof(*(x)))

/*
 * Index element type and the matching GL type token passed to the draw call.
 * Newer glext.h revisions define their own GL_INDEX token (OpenGL 3.0), so
 * any previous definition is dropped first to avoid a macro redefinition.
 */
#ifdef GL_INDEX
#undef GL_INDEX
#endif
#define GLindex			GLushort		// or GLuint
#define GL_INDEX		GL_UNSIGNED_SHORT	// or GL_UNSIGNED_INT

/* Capacities, in elements and in bytes, of the vertex and index buffers. */
#define POLY_VERTEX_MAX		400000
#define VBO_SIZE		(POLY_VERTEX_MAX * sizeof(PolyVertexFormat))
#define INDEX_MAX		(4 * POLY_VERTEX_MAX)
#define IVBO_SIZE		(INDEX_MAX * sizeof(GLindex))

/*
 * One interleaved vertex record. setup_pointers() hands the offset of each
 * member to OpenGL with sizeof(PolyVertexFormat) as the stride, so the
 * member order and types here define the buffer layout.
 */
typedef struct
{
    GLubyte	color[4];	// passed to glColorPointer as GL_UNSIGNED_BYTE
    GLfloat	normal[3];	// passed to glNormalPointer as GL_FLOAT
    GLfloat	texture[2];	// passed to glTexCoordPointer as GL_FLOAT
    GLfloat	vertex[3];	// passed to glVertexPointer as GL_FLOAT
} PolyVertexFormat;

/*
 * Bookkeeping for the vertex and index storage used by the test:
 * the two buffer object names plus CPU-visible pointers to the data.
 */
typedef struct
{
    PolyVertexFormat	*pvert;	// pointer to the vertex buffer
    GLindex		*pindx;	// pointer to the index buffer
    GLuint		name[2];// buffer names when using vertex buffer objects
} ElementBuffer;

/* Single file-scope instance, filled in by scene_init(). */
static ElementBuffer	ebuf;

/*
 * OpenGL extension entry points. These are file-scope function pointers
 * that scene_init() fills in through wglGetProcAddress() before any of
 * them is called.
 */

PFNGLDRAWRANGEELEMENTSEXTPROC		glDrawRangeElements;
PFNGLBINDBUFFERARBPROC			glBindBuffer;
PFNGLGENBUFFERSARBPROC			glGenBuffers;
PFNGLBUFFERDATAARBPROC			glBufferData;
PFNGLBUFFERSUBDATAARBPROC		glBufferSubData;
PFNGLMAPBUFFERARBPROC			glMapBuffer;
PFNGLUNMAPBUFFERARBPROC			glUnmapBuffer;

/*
 * Two triangles of three vertices each. Every vertex has colour
 * (0,255,0,255), normal (0,0,-1), texture coordinates (0,0) and z = 2.
 * scene_init() copies only green_triangle[0] into the vertex buffer.
 */
static PolyVertexFormat	green_triangle[2][3] = {
    {{{0,255,0,255}, {0,0,-1}, {0,0}, {-1,1,2}},
    {{0,255,0,255}, {0,0,-1}, {0,0}, {1,1,2}},
    {{0,255,0,255}, {0,0,-1}, {0,0}, {0,-1,2}}},
    {{{0,255,0,255}, {0,0,-1}, {0,0}, {-1,-1,2}},
    {{0,255,0,255}, {0,0,-1}, {0,0}, {0,1,2}},
    {{0,255,0,255}, {0,0,-1}, {0,0}, {1,-1,2}}}};
/* Indices selecting the three vertices of one triangle, in order. */
static GLindex 		tri_index[3] = {0, 1, 2};

/*
 * Print a human-readable description of an OpenGL error code to stdout.
 *
 * code: value obtained from glGetError(); 0 (no error) prints nothing.
 * str:  optional prefix printed before the description; NULL means "".
 *
 * Codes missing from the table produce a generic "unknown" message.
 */
static void glErrorLog(GLenum code, const char *str)
{
    static const struct
    {
        GLenum      code;
        const char  *text;
    } glErrorTranslate[] = {
        {GL_INVALID_ENUM, "enum argument out of range"},
        {GL_INVALID_VALUE, "Numeric argument out of range"},
        {GL_INVALID_OPERATION, "Operation illegal in current state"},
        {GL_STACK_OVERFLOW, "Command would cause a stack overflow"},
        {GL_STACK_UNDERFLOW, "Command would cause a stack underflow"},
        {GL_OUT_OF_MEMORY, "Not enough memory left to execute command"},
        {GL_TABLE_TOO_LARGE, "The specified table is too large"}};
    const size_t count = sizeof(glErrorTranslate) / sizeof(glErrorTranslate[0]);
    size_t i;

    if (code == 0) return;

    /* size_t index: avoids the signed/unsigned comparison against sizeof */
    for (i = 0; i < count; i++)
    {
        if (glErrorTranslate[i].code == code)
        {
            printf("%s%s\n", str ? str : "", glErrorTranslate[i].text);
            return;
        }
    }

    printf("Unknown gl error code\n");
}

/*
 * Report whether extName is listed in the GL_EXTENSIONS string.
 *
 * A match counts only when it is a complete token: it must start at the
 * beginning of the string or after a space, and end at a space or at the
 * terminating NUL. This keeps a name that is a prefix of a longer
 * extension name from matching.
 *
 * Returns GL_TRUE when found. Returns GL_FALSE when absent, when extName is
 * NULL or empty, or when glGetString() yields no extension string.
 */
static GLboolean QueryExtension(const char *extName)
{
    const char *list, *scan, *hit;
    size_t      len;

    /* An empty needle would match everywhere and could step past the NUL. */
    if (!extName || !*extName) return GL_FALSE;

    list = (const char *)glGetString(GL_EXTENSIONS);
    if (!list) return GL_FALSE;

    len = strlen(extName);

    for (scan = list; (hit = strstr(scan, extName)) != NULL; scan = hit + 1)
    {
        if ((hit == list || hit[-1] == ' ') &&
            (hit[len] == ' ' || hit[len] == '\0'))
            return GL_TRUE;	// properly delimited
    }

    return GL_FALSE;
}

static void	scene_init(int IndexVBO)
{
    typedef BOOL (WINAPI * PFNWGLSWAPINTERVALEXTPROC) (int interval);
    PFNWGLSWAPINTERVALEXTPROC	wglSwapInterval;

    wglSwapInterval = (void*)wglGetProcAddress("wglSwapIntervalEXT");
    wglSwapInterval(0);	// Vsync mode off

    if (QueryExtension("GL_ARB_vertex_buffer_object"))
    {
	glBindBuffer    = (void*)wglGetProcAddress("glBindBufferARB");
	glGenBuffers    = (void*)wglGetProcAddress("glGenBuffersARB");
	glBufferData    = (void*)wglGetProcAddress("glBufferDataARB");
	glBufferSubData = (void*)wglGetProcAddress("glBufferSubDataARB");
	glMapBuffer     = (void*)wglGetProcAddress("glMapBufferARB");
	glUnmapBuffer   = (void*)wglGetProcAddress("glUnmapBufferARB");
    } else printf("Vertex buffer objects not supported
");

    glDrawRangeElements = (void*)wglGetProcAddress("glDrawRangeElements");

    if (!glDrawRangeElements) printf("glDrawRangeElements not supported
");

    glGenBuffers(2, ebuf.name);
    glBindBuffer(GL_ARRAY_BUFFER_ARB, ebuf.name[0]);
    glBufferData(GL_ARRAY_BUFFER_ARB, VBO_SIZE, NULL, GL_STREAM_DRAW_ARB);
    ebuf.pvert = glMapBuffer(GL_ARRAY_BUFFER_ARB, GL_WRITE_ONLY_ARB);
    memcpy(ebuf.pvert, green_triangle[0], sizeof(green_triangle[0]));
    glUnmapBuffer(GL_ARRAY_BUFFER_ARB);

    if (IndexVBO)
    {
	glBindBuffer(GL_ELEMENT_ARRAY_BUFFER_ARB, ebuf.name[1]);
	glBufferData(GL_ELEMENT_ARRAY_BUFFER_ARB, IVBO_SIZE, NULL,
		     GL_STREAM_DRAW_ARB);
	ebuf.pindx = glMapBuffer(GL_ELEMENT_ARRAY_BUFFER_ARB,
				    GL_WRITE_ONLY_ARB);
	memcpy(ebuf.pindx, tri_index, sizeof(tri_index));
	glUnmapBuffer(GL_ELEMENT_ARRAY_BUFFER_ARB);
    }
    else
    {
	ebuf.pindx = malloc(IVBO_SIZE);
	memcpy(ebuf.pindx, tri_index, sizeof(tri_index));
    }

    glMatrixMode(GL_PROJECTION);
    glFrustum(1, -1, 1, -1, 1, 1000);
    glScalef(-1,-1,-1);
    glMatrixMode(GL_MODELVIEW);
    glLoadIdentity();
    glEnableClientState(GL_COLOR_ARRAY);
    glEnableClientState(GL_VERTEX_ARRAY);
    glEnableClientState(GL_TEXTURE_COORD_ARRAY);
    glEnableClientState(GL_NORMAL_ARRAY);
    glClearColor(1,0,0,0);
}

static void setup_pointers(void *base)
{
    glTexCoordPointer(DIM(((PolyVertexFormat*)0)->texture),
		      GL_FLOAT, sizeof(PolyVertexFormat),
		      base + (int)((PolyVertexFormat*)0)->texture);
    glNormalPointer(GL_FLOAT, sizeof(PolyVertexFormat),
		       base + (int)((PolyVertexFormat*)0)->normal);
    glColorPointer(DIM(((PolyVertexFormat*)0)->color),
		   GL_UNSIGNED_BYTE, sizeof(PolyVertexFormat),
		   base + (int)((PolyVertexFormat*)0)->color);
    glVertexPointer(DIM(((PolyVertexFormat*)0)->vertex), GL_FLOAT,
		    sizeof(PolyVertexFormat),
		    base + (int)((PolyVertexFormat*)0)->vertex);
}

/*
 * GLUT display callback that runs the whole benchmark.
 *
 * It initialises the scene once, then for a fixed number of frames maps and
 * unmaps the vertex buffer (and the index buffer when IndexVBO is set),
 * issues several glDrawRangeElements calls, and prints the time from just
 * before glClear until glFinish returns. After the last frame it logs one
 * pending GL error, if any, and exits the process.
 */
static void	drawing_loop(void)
{
    enum { FRAME_COUNT = 50, DRAWS_PER_FRAME = 5 };
    LARGE_INTEGER	t0, t1, freq;
    int			frame, IndexVBO = 0;

    scene_init(IndexVBO);		// one time initialization
    QueryPerformanceFrequency(&freq);	// counts per second

    for (frame = 0; frame < FRAME_COUNT; frame++)
    {
        int  draw;

        /* The map/unmap pair is the trigger; the pointer is not needed. */
        glBindBuffer(GL_ARRAY_BUFFER_ARB, ebuf.name[0]);
        glMapBuffer(GL_ARRAY_BUFFER_ARB, GL_WRITE_ONLY_ARB);
        glUnmapBuffer(GL_ARRAY_BUFFER_ARB);

        if (IndexVBO)
        {
            glBindBuffer(GL_ELEMENT_ARRAY_BUFFER_ARB, ebuf.name[1]);
            glMapBuffer(GL_ELEMENT_ARRAY_BUFFER_ARB, GL_WRITE_ONLY_ARB);
            glUnmapBuffer(GL_ELEMENT_ARRAY_BUFFER_ARB);
        }

        QueryPerformanceCounter(&t0);
        glClear(GL_COLOR_BUFFER_BIT);
        setup_pointers(0);

        // multiple draw calls stimulate the VBO problem
        // three for GL_STATIC_DRAW_ARB and GL_DYNAMIC_DRAW_ARB
        // and five for GL_STREAM_DRAW_ARB

        for (draw = 0; draw < DRAWS_PER_FRAME; draw++)
            glDrawRangeElements(GL_TRIANGLES, 0, 2, 3, GL_INDEX,
                                IndexVBO ? NULL: ebuf.pindx);

        glFinish();	// wait for the GPU so the timing covers the draws
        QueryPerformanceCounter(&t1);
        glutSwapBuffers();
        printf("glDrawRangeElements %f milliseconds\n",
               1000.0 * (t1.QuadPart - t0.QuadPart)/freq.QuadPart);
        fflush(stdout);
    }

    glErrorLog(glGetError(), "drawing_loop() ");	// any errors?

    exit(0);
}

/*
 * Program entry: create a small double-buffered RGBA window with GLUT and
 * register drawing_loop() as its display callback, then enter the GLUT
 * main loop.
 */
int main(int argc, char *argv[])
{
    glutInit(&argc, argv);
    glutInitDisplayMode(GLUT_RGBA | GLUT_DOUBLE | GLUT_ALPHA);
    glutInitWindowSize(100, 100);
    glutInitWindowPosition(0, 0);

    /* The window identifier is not needed afterwards. */
    (void)glutCreateWindow("");

    glutDisplayFunc(drawing_loop);
    glutMainLoop();

    return 0;
}