VBO/VP/AttribArrays Horribly slow on GFFx 5200Ultra

why is the following code so slow on a NVidia Geforce FX 5200 Ultra ?

I Get 44FPS in debug without VBO, 8FPS with VBO.

this pipeline is entirely VertexProgram and Generic Attribute Array Based.

////////////////////////////////////////////////////////////////////
// VtxBuf_Draw
//
// Draws a vertex buffer through the generic vertex-attribute-array path,
// once for every pass of the current material (mpCurMaterial).
//
//   pVBuf   - vertex buffer to draw; nothing is drawn when GetNum() returns 0
//   iNumIDX - number of 16 bit indices; 0 selects the non-indexed
//             (glDrawArrays) path, which uses pVBuf->GetPrimType()
//   pIDX    - index data for the indexed path (client memory, or the cache
//             key of an index buffer object when USEIBO is enabled)
//   eTyp    - primitive type used by the indexed path
//   pVBOR   - not referenced by this function
//   bwire   - not referenced by this function
//
// When USEVBO is enabled, buffers flagged EVTXBUFFLAG_GFXRES are uploaded
// into a buffer object on first use and drawn from it afterwards.
// On exit the array buffer binding (and the element array buffer binding,
// if this call changed it) is reset to 0.
//
// NOTE(review): the V12C4N6I2T8 layout uses a 3 component GL_SHORT (6 bytes)
// and a 1 component GL_UNSIGNED_SHORT attribute, so attributes are not all
// 4 byte sized. Some drivers reportedly leave the fast path for such formats
// when the data lives in a buffer object - worth checking as a cause of a
// VBO slowdown; not confirmed from this code alone.
// NOTE(review): GL_ERRORCHECK() is invoked around every draw call; if it maps
// to glGetError() in the build being measured it may distort timings - confirm.
////////////////////////////////////////////////////////////////////
void CGfxTargetW32GL::VtxBuf_Draw( CVtxBufferBase *pVBuf, int iNumIDX, U16 *pIDX, EPrimitiveType eTyp, void *pVBOR, bool bwire )
{
int iNum = pVBuf->GetNum();
if( iNum )
{
//glPushClientAttrib( GL_CLIENT_ALL_ATTRIB_BITS );

  if( IsPickState() )
  {
  	// while picking, the object ID is handed to the vertex program in constant 27
  	SetVertexShaderConstant( 27, GetObjID() );
  }
  
  EVtxStreamFormat eStrFmt = pVBuf->GetStreamFormat();
  int iStride = pVBuf->GetVtxSize();

  // Base for all attribute pointers: the client memory address, or 0 when a
  // buffer object is bound (the "pointers" are then byte offsets into it).
  U8 *pVtxBase = (U8*)pVBuf->GetVertexPointer();

  // Set when this call binds an element array buffer, so it can be unbound on exit.
  bool bIdxBufBound = false;

  ////////////////////////////////////////////////////////////////////
  // setup VBO or DL

  #if USEVBO 
  {
  	VtxBufH hPB = pVBuf->GetPBHandle();

  	if( hPB )
  	{
  		// buffer object already exists: bind it and switch to offsets
  		glBindBufferARB( GL_ARRAY_BUFFER_ARB, hPB );
  		pVtxBase = 0;
  	}
  	else if( (EVTXBUFFLAG_GFXRES==pVBuf->GetFlags()) )
  	{
  		// Create A VBO and copy data into it
  		U32 ubh = 0;
  		glGenBuffersARB( 1, (GLuint*) & ubh );
  		hPB = (VtxBufH) ubh;
  		pVBuf->SetPBHandle( hPB );
  		glBindBufferARB( GL_ARRAY_BUFFER_ARB, hPB );
  		GL_ERRORCHECK();
  		int iVBlen = pVBuf->GetVtxSize()*pVBuf->GetMax();
  		glBufferDataARB( GL_ARRAY_BUFFER_ARB, iVBlen, pVBuf->GetVertexPointer(), GL_STATIC_DRAW_ARB );
  		GL_ERRORCHECK();

  		// read the size back to verify that storage was really allocated
  		int nParam_ArrayObjectSize = 0; 

  		glGetBufferParameterivARB( GL_ARRAY_BUFFER_ARB, GL_BUFFER_SIZE_ARB, &nParam_ArrayObjectSize );

  		if( nParam_ArrayObjectSize <= 0 )
  		{
  			MessageBox(NULL,"glBufferDataARB failed to allocate any memory!",
  				"ERROR",MB_OK|MB_ICONEXCLAMATION);
  		}

  		// NOTE(review): hPB is still bound from the upload above, so this
  		// unbind/rebind pair looks redundant; kept in case it is deliberate.
  		glBindBufferARB( GL_ARRAY_BUFFER_ARB, 0 );
  		glBindBufferARB( GL_ARRAY_BUFFER_ARB, hPB );
  		pVtxBase = 0;
  	}
  	else
  	{
  		// not a resident buffer: draw straight from client memory
  		glBindBufferARB( GL_ARRAY_BUFFER_ARB, 0 );
  	}

  	GL_ERRORCHECK();

  	#if USEIBO
  	{
  		// Cache of index buffer objects, keyed by the client index pointer.
  		// NOTE(review): the key is the pointer only; an index array that is
  		// modified or reused with a different iNumIDX keeps its first upload.
  		static map<U16*,VtxBufH> IdxBufferMap;

  		if( hPB )
  		{
  			VtxBufH hIB = MagSTXFindValFromKey( IdxBufferMap, pIDX, (VtxBufH) 0 );
  			
  			if( 0==hIB )
  			{
  				glGenBuffersARB( 1, &hIB );
  				glBindBufferARB( GL_ELEMENT_ARRAY_BUFFER_ARB, hIB );
  				glBufferDataARB( GL_ELEMENT_ARRAY_BUFFER_ARB, iNumIDX*sizeof(U16), pIDX, GL_STATIC_DRAW_ARB );
  				MagSTXMapInsert( IdxBufferMap, pIDX, hIB );
  				GL_ERRORCHECK();
  			}
  			else
  			{
  				glBindBufferARB( GL_ELEMENT_ARRAY_BUFFER_ARB, hIB );
  				GL_ERRORCHECK();
  			}

  			// indices now come from the bound buffer object, starting at offset 0
  			pIDX = 0;
  			bIdxBufBound = true;

  		}
  		
  	}
  	#endif

  }
  #endif

  ////////////////////////////////////////////////////////////////////
  // set stream format
  // (attribute offsets below are byte offsets within one vertex)

  switch( eStrFmt )
  {
  	case EVTXSTREAMFMT_V12C4N6I2T8:
  	{	
  		glEnableVertexAttribArrayARB( 0 );
  		glEnableVertexAttribArrayARB( 1 );
  		glEnableVertexAttribArrayARB( 2 );
  		glEnableVertexAttribArrayARB( 3 );
  		glEnableVertexAttribArrayARB( 4 );
  		
  		//////////////////////////////////////////////////////////////////////
  		#if( _BUILD_LEVEL > 1 )
  		//////////////////////////////////////////////////////////////////////
  		// debug visualisation: attribute 1 (color) can be fed from another stream
  		int iRenderMode = CSystem::GetGlobalIntVariable( "iRenderMode" );
  		switch( iRenderMode )
  		{
  			case 3: // route Normals to color
  				glVertexAttribPointerARB( 0, 3, GL_FLOAT,			KVANRM_FALSE, iStride, (void*) & pVtxBase[0] );	// V12
  				glVertexAttribPointerARB( 2, 3, GL_SHORT,			KVANRM_TRUE,  iStride, (void*) & pVtxBase[16] );	// N6
  				glVertexAttribPointerARB( 3, 1, GL_UNSIGNED_SHORT,	KVANRM_FALSE, iStride, (void*) & pVtxBase[22] );	// I2
  				glVertexAttribPointerARB( 4, 2, GL_FLOAT,			KVANRM_FALSE, iStride, (void*) & pVtxBase[24] );	// T8
  				glVertexAttribPointerARB( 1, 3, GL_SHORT,			KVANRM_TRUE,  iStride, (void*) & pVtxBase[16] );	// N6
  				break;
  			case 2: // route UVs to color
  				glVertexAttribPointerARB( 0, 3, GL_FLOAT,			KVANRM_FALSE, iStride, (void*) & pVtxBase[0] );	// V12
  				glVertexAttribPointerARB( 2, 3, GL_SHORT,			KVANRM_TRUE,  iStride, (void*) & pVtxBase[16] );	// N6
  				glVertexAttribPointerARB( 3, 1, GL_UNSIGNED_SHORT,	KVANRM_FALSE, iStride, (void*) & pVtxBase[22] );	// I2
  				glVertexAttribPointerARB( 4, 2, GL_FLOAT,			KVANRM_FALSE, iStride, (void*) & pVtxBase[24] );	// T8
  				glVertexAttribPointerARB( 1, 2, GL_FLOAT,			KVANRM_FALSE, iStride, (void*) & pVtxBase[24] );	// T8
  				break;
  			case 0:
  			default:
  				glVertexAttribPointerARB( 0, 3, GL_FLOAT,			KVANRM_FALSE, iStride, (void*) & pVtxBase[0] );	// V12
  				glVertexAttribPointerARB( 1, 4, GL_UNSIGNED_BYTE,	KVANRM_TRUE,  iStride, (void*) & pVtxBase[12] );	// C4
  				glVertexAttribPointerARB( 2, 3, GL_SHORT,			KVANRM_TRUE,  iStride, (void*) & pVtxBase[16] );	// N6
  				glVertexAttribPointerARB( 3, 1, GL_UNSIGNED_SHORT,	KVANRM_FALSE, iStride, (void*) & pVtxBase[22] );	// I2
  				glVertexAttribPointerARB( 4, 2, GL_FLOAT,			KVANRM_FALSE, iStride, (void*) & pVtxBase[24] );	// T8
  				break;
  		}
  		//////////////////////////////////////////////////////////////////////
  		#else
  		//////////////////////////////////////////////////////////////////////
  		glVertexAttribPointerARB( 0, 3, GL_FLOAT,			KVANRM_FALSE, iStride, (void*) & pVtxBase[0] );	// V12
  		glVertexAttribPointerARB( 1, 4, GL_UNSIGNED_BYTE,	KVANRM_TRUE,  iStride, (void*) & pVtxBase[12] );	// C4
  		glVertexAttribPointerARB( 2, 3, GL_SHORT,			KVANRM_TRUE,  iStride, (void*) & pVtxBase[16] );	// N6
  		glVertexAttribPointerARB( 3, 1, GL_UNSIGNED_SHORT,	KVANRM_FALSE, iStride, (void*) & pVtxBase[22] );	// I2
  		glVertexAttribPointerARB( 4, 2, GL_FLOAT,			KVANRM_FALSE, iStride, (void*) & pVtxBase[24] );	// T8
  		//////////////////////////////////////////////////////////////////////
  		#endif
  		//////////////////////////////////////////////////////////////////////
  		GL_ERRORCHECK();

  		break;
  	}
  	case EVTXSTREAMFMT_V12C4T4:
  	{	glDisableVertexAttribArrayARB( 3 );
  		glDisableVertexAttribArrayARB( 4 );
  		glEnableVertexAttribArrayARB( 0 );
  		glEnableVertexAttribArrayARB( 1 );
  		glEnableVertexAttribArrayARB( 2 );
  		glVertexAttribPointerARB( 0, 3, GL_FLOAT,			KVANRM_FALSE, iStride, (void*) & pVtxBase[0] );	// V12
  		glVertexAttribPointerARB( 1, 4, GL_UNSIGNED_BYTE,	KVANRM_TRUE,  iStride, (void*) & pVtxBase[12] );	// C4
  		glVertexAttribPointerARB( 2, 2, GL_SHORT,			KVANRM_FALSE, iStride, (void*) & pVtxBase[16] );	// T4
  		GL_ERRORCHECK();
  		break;
  	}
  	case EVTXSTREAMFMT_V4C4:
  	{	glDisableVertexAttribArrayARB( 2 );
  		glDisableVertexAttribArrayARB( 3 );
  		glDisableVertexAttribArrayARB( 4 );
  		glEnableVertexAttribArrayARB( 0 );
  		glEnableVertexAttribArrayARB( 1 );
  		glVertexAttribPointerARB( 0, 2, GL_SHORT, KVANRM_FALSE, 8, (void*) & pVtxBase[0] );	// V4
  		glVertexAttribPointerARB( 1, 4, GL_UNSIGNED_BYTE, KVANRM_TRUE, 8, (void*) & pVtxBase[4] );	// C4
  		GL_ERRORCHECK();
  		break;
  	}
  	case EVTXSTREAMFMT_V4T4:
  	{	glDisableVertexAttribArrayARB( 2 );
  		glDisableVertexAttribArrayARB( 3 );
  		glDisableVertexAttribArrayARB( 4 );
  		glEnableVertexAttribArrayARB( 0 );
  		glEnableVertexAttribArrayARB( 1 );
  		glVertexAttribPointerARB( 0, 2, GL_SHORT, KVANRM_FALSE, 8, (void*) & pVtxBase[0] );	// V4
  		glVertexAttribPointerARB( 1, 2, GL_SHORT, KVANRM_FALSE, 8, (void*) & pVtxBase[4] );	// T4
  		GL_ERRORCHECK();
  		break;
  	}
  	default:
  		// unknown format: attribute array state is left as it was
  		break;
  }

  ////////////////////////////////////////////////////////////////////
  // draw it

  int inumpasses = mpCurMaterial->GetNumPasses();
  for( int ipass=0; ipass<inumpasses; ipass++ )
  {
  	mpCurMaterial->SetupPass( ipass );

  	if( iNumIDX ) switch( eTyp ) // Primitive / Indices Override
  	{
  		case EPRIM_LINES:
  			GL_ERRORCHECK();
  			glDrawElements( GL_LINES, iNumIDX, GL_UNSIGNED_SHORT, pIDX );
  			GL_ERRORCHECK();
  			break;
  		case EPRIM_TRIANGLES:
  			GL_ERRORCHECK();
  			glDrawElements( GL_TRIANGLES, iNumIDX, GL_UNSIGNED_SHORT, pIDX );
  			miTrianglesRendered += (iNumIDX/3);
  			GL_ERRORCHECK();
  			break;
  		case EPRIM_TRIANGLESTRIP:
  			GL_ERRORCHECK();
  			glDrawElements( GL_TRIANGLE_STRIP, iNumIDX, GL_UNSIGNED_SHORT, pIDX );
  			miTrianglesRendered += (iNumIDX-2);
  			GL_ERRORCHECK();
  			break;
  		default:
  			// unsupported primitive type: draw as points, then flag it
  			glDrawElements( GL_POINTS, iNumIDX, GL_UNSIGNED_SHORT, pIDX );
  			MagAssert( false );
  			break;
  	
  	}
  	else switch( pVBuf->GetPrimType() )
  	{	case EPRIM_LINES:
  			GL_ERRORCHECK();
  			glDrawArrays( GL_LINES, 0, iNum );
  			GL_ERRORCHECK();
  			break;
  		case EPRIM_TRIANGLES:
  			GL_ERRORCHECK();
  			glDrawArrays( GL_TRIANGLES, 0, iNum );
  			GL_ERRORCHECK();
  			break;
  		default:
  			// unsupported primitive type: draw as points, then flag it
  			glDrawArrays( GL_POINTS, 0, iNum );
  			MagAssert( false );
  			break;
  	}

  	// NOTE(review): presumably balances a push done inside SetupPass() - confirm
  	PopRenderState();

  }	// for( int ipass=0; ipass<inumpasses; ipass++ )
  	//////////////////////////////////////////////////////

  //glPopClientAttrib();

  glBindBufferARB( GL_ARRAY_BUFFER_ARB, 0 );

  if( bIdxBufBound )
  {
  	// Do not leak the index buffer binding: with it still bound, a later
  	// glDrawElements() given a client memory pointer would interpret that
  	// pointer as an offset into this buffer object.
  	glBindBufferARB( GL_ELEMENT_ARRAY_BUFFER_ARB, 0 );
  }

} // if( iNum )

GL_ERRORCHECK();

}

[This message has been edited by tweakoz (edited 11-03-2003).]

[This message has been edited by tweakoz (edited 11-03-2003).]

I forgot to mention that all the VBOs are static (so far… ;> )

mtm

Slow compared to what? What res/depth? Full screen or window?

Originally posted by Elixer:
[b]Slow compared to what? What res/depth? Full screen or window?

[/b]

I assumed since I was posting to the advanced forum, that anyone else would (or should)assume I have already ruled out fillrate or other unrelated issues. (I would have posted to beginners forum otherwise ;> )

The comparison (44FPS vs 8FPS) is achieved merely by altering the USEVBO define. That define is ONLY referenced in one place, which is the posted code fragment. Everything else is identical, including the visual results.

There is only one path in this code responsible for the slowdown, in that path, eStrFmt is EVTXSTREAMFMT_V12C4N6I2T8 (skinned) and eTyp is EPRIM_TRIANGLESTRIP, and VBOS are active in that case.

I am 99% sure this is driver related (52.16),
but if anyone with VBO experience on Nvidia FX class cards knows anything about this, if Im perhaps not initializing something properly, please reply… ;>

mtm

Hmmm…I’m guessing you tried other driver versions? I haven’t messed with VBOs as much as I would have liked to by now, but is it possible to get the full source so we can have a go at it?

-SirKnight

You are using index/vertex data in one buffer, right (if I got it correctly - I do not have much experience reading C++ code)? Split them in this case. There is an issue in the driver.

Originally posted by Zengar:
[b] You are using index/vertex data in one buffer, right (if I got it correctly - I do not have much experience reading C++ code)? Split them in this case. There is an issue in the driver.

[/b]

GL’s discussion board likes to ‘unformat’ my text, i always have a problem with it, probably makes the c++ harder to read ;<

Im not using VBO’s for index buffers currently, Im using standard host memory for index buffers right now.

happy halloween everyone, im off to get drunk ;>

mtm

don’t spend your money on alcohol, but buy a new (faster) graphics card . Like me, I drink a lot and still use an gf4…

GL’s discussion board likes to ‘unformat’ my text, i always have a problem with it, probably makes the c++ harder to read ;<

Use the code tags to remedy that problem. [ code] code here [ /code]. Of course without the space after the first [.

-SirKnight

I will always have problem reading C++ code so it won’t help 8)

I tried to move to C++ some time ago but after two days I was sick of it and now I’m back with my old good Delphi.

Originally posted by Zengar:
I tried to move to C++ some time ago but after two days I was sick of it…

[irony]
Wow, two whole days. That’s almost too much. I can’t see how you managed…
[/irony]

Originally posted by tweakoz:
[b] I assumed since I was posting to the advanced forum, that anyone else would (or should)assume I have already ruled out fillrate or other unrelated issues. (I would have posted to beginners forum otherwise ;> )

The comparison (44FPS vs 8FPS) is achieved merely by altering the USEVBO define. That define is ONLY referenced in one place, which is the posted code fragment. Everything else is identical, including the visual results.

There is only one path in this code responsible for the slowdown, in that path, eStrFmt is EVTXSTREAMFMT_V12C4N6I2T8 (skinned) and eTyp is EPRIM_TRIANGLESTRIP, and VBOS are active in that case.

I am 99% sure this is driver related (52.16),
but if anyone with VBO experience on Nvidia FX class cards knows anything about this, if Im perhaps not initializing something properly, please reply… ;>

mtm[/b]

LOL… you can never tell these days, I see lots of posts from beginners in this forum, so we never know who we are talking to.

Those are the new force drivers right? Try one of the older dets, and see if it is the same. I would try now, but I switched to a 9600, and well, those drivers have issues also. With 2 lockups, 4 “recover” alerts, and 3 just black screens, makes me think I am dealing with the same old ATI rage 128 drivers. BTW, I tried cat 3.8, and then tried 3.7s. Those seem more stable, but again, they still have issues.

Oops, looks like I hijacked this thread. Sorry… maybe I need a drink now

Originally posted by Elixer:
LOL… you can never tell these days, I see lots of posts from beginners in this forum, so we never know who we are talking to.

Yes, I can see there are still beginners posting a lot in this forum, but I think the forum should still give me (or any joe poster) the benefit of the doubt, until proven otherwise… Once the poster has given enough info, it would be easy enough to determine if the question belongs in the beginners forum. Even then it’s not that big of a deal to just ignore the question, or recommend that it be asked in the beginners forum. Or perhaps a moderator could move the question, if the forum server software supports that.

It just gets old and takes up too much time to have to specify details that should be obvious. (Luckily our game shipped already and our crunch is over so I dont mind right now…) ;>

Those are the new force drivers right? Try one of the older dets, and see if it is the same. I would try now, but I switched to a 9600, and well, those drivers have issues also. With 2 lockups, 4 “recover” alerts, and 3 just black screens, makes me think I am dealing with the same old ATI rage 128 drivers. BTW, I tried cat 3.8, and then tried 3.7s. Those seem more stable, but again, they still have issues.

I actually wrote the VBO code several months ago with the 44/45 series detonators, now Im on 52.16, the results were the same. I have a 9500 pro but its in my linux box at home, maybe I will have to swap them again… Last time I checked the 9500pro didnt like VBO and it didnt like generic vertex attribs, actually that was the only reason I took that 9500 out of my work machine, otherwise I liked that card… this 5200 was more expensive and is way slower… but the drivers seem slightly more mature. It couldnt possibly be an AGP problem, If the geometry is resident, the AGP traffic should decrease, even the non VBO path had to pass index buffers over the AGP bus.

I also wrote DX8 and DX9 renderers for this engine, although their code is out of date. My engine’s architecture changes a lot and I don’t always update every system…, maybe I will have to try them out to see if they fare better.

I prefer GL, but occasionally DX seems to support certain features better (like resident VB’s) VBO’s still seem unusable to me …

So does this mean no one has seen actual problems with the source code I have posted ?

OOps, looks like a hijacked this thread. sorry… maybe I need a drink now

Im still paying for this weekends halloween partying. My head hurts…

mtm

[This message has been edited by tweakoz (edited 11-03-2003).]

[This message has been edited by tweakoz (edited 11-03-2003).]

So you can see if it is a driver issue, get this: http://www.codesampler.com/source/ogl_vertex_buffer_objects.zip

And running it…With VBO I get around 250fps, without (Vertex array) I get 30.
(this is just by running the program, and toggling VBO on/off with F1.)

I know it isn’t quite what you had in mind, but it may shed a crumb of info for you.

Originally posted by tweakoz:
why is the following code so slow on a NVidia Geforce FX 5200 Ultra ?

I had the same problem with GeforceTi4200.
It seems that the new 52.16 driver does some aggressive optimization of static data, but has a lot of problems with it if the data changes frequently (one change every frame)…
Use DYNAMIC_DRAW instead. Unfortunately this seems to be a simple vertex array implementation…

Strange, on that demo I get 45fps on both modes. VBO didn’t help at all. I have a GeForce FX 5600 Ultra, Force Ware drivers, AMD XP 2600+ and 512mb DDR w/ WinXP Pro.

-SirKnight

Ok…I turned vsync off and now I get 68 to 69. That’s also weird as my refresh is at 85Hz. Also again, VBO did nothing to help.

-SirKnight

Originally posted by SirKnight:
[b]Ok…I turned vsync off and now I get 68 to 69. That’s also weird as my refresh is at 85Hz. Also again, VBO did nothing to help.

-SirKnight[/b]

I forgot to say that I was using a 9600 card with cat 3.7 drivers on a 1900+ 512MB/win2k system. I guess the Nvidia drivers do have a bug since you can’t get above 68.

[This message has been edited by Elixer (edited 11-03-2003).]

I have a GF5900Ultra + Dets 52.16 Win XP 2000+, I get 30 fps with vertex arrays and 210 fps with VBO. The drivers work fine for me.

These are very interesting results. The 9600 beats the 5900U by a good amount and at the same time both of those cards benefit greatly with VBO. Yet my card does not. I guess there is a bug with the NV30 chipset and these new drivers. Strange because the NV35 really isn’t that much different than the NV30s, a little bit but not anything super significant.

Maybe I’ll try the older 4x.xx drivers and see what happens.

-SirKnight