PDA

View Full Version : VBO/VP/AttribArrays Horribly slow on GFFx 5200Ultra



tweakoz
10-31-2003, 12:05 PM
why is the following code so slow on a NVidia Geforce FX 5200 Ultra ?

I Get 44FPS in debug without VBO, 8FPS with VBO.

this pipeline is entirely VertexProgram and Generic Attribute Array Based.



////////////////////////////////////////////////////////////////////
// Draws one vertex buffer through the generic-vertex-attribute array path.
//
//  pVBuf   - vertex buffer to draw; nothing is drawn when pVBuf->GetNum() is 0
//  iNumIDX - number of 16 bit indices in pIDX; 0 selects the non-indexed
//            glDrawArrays path, which uses pVBuf->GetPrimType() instead of eTyp
//  pIDX    - index array in client memory (uploaded to an index buffer object
//            on first use when USEVBO and USEIBO are enabled)
//  eTyp    - primitive type used by the indexed path
//  pVBOR   - not referenced by this function
//  bwire   - not referenced by this function
//
// With USEVBO enabled, buffers flagged EVTXBUFFLAG_GFXRES are uploaded to a
// vertex buffer object on first use and drawn from it afterwards; all other
// buffers are drawn from client memory. All buffer bindings made here are
// released again before returning.
////////////////////////////////////////////////////////////////////
void CGfxTargetW32GL::VtxBuf_Draw( CVtxBufferBase *pVBuf, int iNumIDX, U16 *pIDX, EPrimitiveType eTyp, void *pVBOR, bool bwire )
{
    int iNum = pVBuf->GetNum();
    if( iNum )
    {
        //glPushClientAttrib( GL_CLIENT_ALL_ATTRIB_BITS );

        if( IsPickState() )
        {
            SetVertexShaderConstant( 27, GetObjID() );
        }

        EVtxStreamFormat eStrFmt = pVBuf->GetStreamFormat();
        int iStride = pVBuf->GetVtxSize();
        // Base address for the attribute pointers. It is reset to 0 whenever a
        // VBO is bound, because the pointers are then offsets into that buffer.
        U8 *pVtxBase = (U8*)pVBuf->GetVertexPointer();

        ////////////////////////////////////////////////////////////////////
        // setup VBO or DL

#if USEVBO
        {
            VtxBufH hPB = pVBuf->GetPBHandle();

            if( hPB )
            {
                // Already uploaded: just bind it.
                glBindBufferARB( GL_ARRAY_BUFFER_ARB, hPB );
                pVtxBase = 0;
            }
            else if( (EVTXBUFFLAG_GFXRES==pVBuf->GetFlags()) )
            {
                // Create A VBO and copy data into it
                U32 ubh = 0;
                glGenBuffersARB( 1, (GLuint*) & ubh );
                hPB = (VtxBufH) ubh;
                pVBuf->SetPBHandle( hPB );
                glBindBufferARB( GL_ARRAY_BUFFER_ARB, hPB );
                GL_ERRORCHECK();
                int iVBlen = pVBuf->GetVtxSize()*pVBuf->GetMax();
                glBufferDataARB( GL_ARRAY_BUFFER_ARB, iVBlen, pVBuf->GetVertexPointer(), GL_STATIC_DRAW_ARB );
                GL_ERRORCHECK();

                // Ask the driver how much it actually allocated.
                int nParam_ArrayObjectSize = 0;

                glGetBufferParameterivARB( GL_ARRAY_BUFFER_ARB, GL_BUFFER_SIZE_ARB, &nParam_ArrayObjectSize );

                if( nParam_ArrayObjectSize <= 0 )
                {
                    MessageBox(NULL,"glBufferDataARB failed to allocate any memory!",
                    "ERROR",MB_OK|MB_ICONEXCLAMATION);
                }

                // NOTE(review): this unbind/rebind leaves the binding unchanged and
                // looks redundant; kept as-is in case it is a driver workaround.
                glBindBufferARB( GL_ARRAY_BUFFER_ARB, 0 );
                glBindBufferARB( GL_ARRAY_BUFFER_ARB, hPB );
                pVtxBase = 0;
            }
            else
            {
                // Client-memory vertex data: make sure no VBO is bound.
                glBindBufferARB( GL_ARRAY_BUFFER_ARB, 0 );
            }

            GL_ERRORCHECK();

#if USEIBO
            {
                // Index buffer objects, keyed by the client index pointer.
                // NOTE(review): an entry is never refreshed, so the map assumes the
                // data (and count) behind a given pIDX never changes - confirm.
                static map<U16*,VtxBufH> IdxBufferMap;

                // Only for indexed draws from a VBO; skipping the non-indexed case
                // avoids creating a zero-sized buffer keyed on a null pointer.
                if( hPB && iNumIDX && pIDX )
                {
                    VtxBufH hIB = MagSTXFindValFromKey( IdxBufferMap, pIDX, (VtxBufH) 0 );

                    if( 0==hIB )
                    {
                        // First use of this index array: upload it.
                        U32 uib = 0;
                        glGenBuffersARB( 1, (GLuint*) & uib );
                        hIB = (VtxBufH) uib;
                        glBindBufferARB( GL_ELEMENT_ARRAY_BUFFER_ARB, hIB );
                        glBufferDataARB( GL_ELEMENT_ARRAY_BUFFER_ARB, iNumIDX*sizeof(U16), pIDX, GL_STATIC_DRAW_ARB );
                        MagSTXMapInsert( IdxBufferMap, pIDX, hIB );
                        GL_ERRORCHECK();
                    }
                    else
                    {
                        glBindBufferARB( GL_ELEMENT_ARRAY_BUFFER_ARB, hIB );
                        GL_ERRORCHECK();
                    }
                    // glDrawElements now takes an offset into the bound index buffer.
                    pIDX = 0;
                }
            }
#endif

        }
#endif

        ////////////////////////////////////////////////////////////////////
        // set stream format
        //
        // NOTE: per NVIDIA's feedback on the GeForce FX / 52.16 driver, generic
        // attributes sourced from a VBO only stay on the fast path for FLOAT,
        // HALF_FLOAT and non-normalized SHORT, or for normalized UNSIGNED_BYTE.
        // The normalized GL_SHORT normal (N6) and the GL_UNSIGNED_SHORT index (I2)
        // below therefore trigger a slow fallback when a VBO is bound. The formats
        // are left unchanged here because altering them needs matching changes
        // in the vertex programs.

        switch( eStrFmt )
        {
            case EVTXSTREAMFMT_V12C4N6I2T8:
            {
                glEnableVertexAttribArrayARB( 0 );
                glEnableVertexAttribArrayARB( 1 );
                glEnableVertexAttribArrayARB( 2 );
                glEnableVertexAttribArrayARB( 3 );
                glEnableVertexAttribArrayARB( 4 );

                //////////////////////////////////////////////////////////////////////
#if( _BUILD_LEVEL > 1 )
                //////////////////////////////////////////////////////////////////////
                // Debug visualisation: attribute 1 (color) can be re-routed.
                int iRenderMode = CSystem::GetGlobalIntVariable( "iRenderMode" );
                switch( iRenderMode )
                {
                    case 3: // route Normals to color
                        glVertexAttribPointerARB( 0, 3, GL_FLOAT, KVANRM_FALSE, iStride, (void*) & pVtxBase[0] ); // V12
                        glVertexAttribPointerARB( 2, 3, GL_SHORT, KVANRM_TRUE, iStride, (void*) & pVtxBase[16] ); // N6
                        glVertexAttribPointerARB( 3, 1, GL_UNSIGNED_SHORT, KVANRM_FALSE, iStride, (void*) & pVtxBase[22] ); // I2
                        glVertexAttribPointerARB( 4, 2, GL_FLOAT, KVANRM_FALSE, iStride, (void*) & pVtxBase[24] ); // T8
                        glVertexAttribPointerARB( 1, 3, GL_SHORT, KVANRM_TRUE, iStride, (void*) & pVtxBase[16] ); // N6
                        break;
                    case 2: // route UVs to color
                        glVertexAttribPointerARB( 0, 3, GL_FLOAT, KVANRM_FALSE, iStride, (void*) & pVtxBase[0] ); // V12
                        glVertexAttribPointerARB( 2, 3, GL_SHORT, KVANRM_TRUE, iStride, (void*) & pVtxBase[16] ); // N6
                        glVertexAttribPointerARB( 3, 1, GL_UNSIGNED_SHORT, KVANRM_FALSE, iStride, (void*) & pVtxBase[22] ); // I2
                        glVertexAttribPointerARB( 4, 2, GL_FLOAT, KVANRM_FALSE, iStride, (void*) & pVtxBase[24] ); // T8
                        glVertexAttribPointerARB( 1, 2, GL_FLOAT, KVANRM_FALSE, iStride, (void*) & pVtxBase[24] ); // T8
                        break;
                    case 0:
                    default:
                        glVertexAttribPointerARB( 0, 3, GL_FLOAT, KVANRM_FALSE, iStride, (void*) & pVtxBase[0] ); // V12
                        glVertexAttribPointerARB( 1, 4, GL_UNSIGNED_BYTE, KVANRM_TRUE, iStride, (void*) & pVtxBase[12] ); // C4
                        glVertexAttribPointerARB( 2, 3, GL_SHORT, KVANRM_TRUE, iStride, (void*) & pVtxBase[16] ); // N6
                        glVertexAttribPointerARB( 3, 1, GL_UNSIGNED_SHORT, KVANRM_FALSE, iStride, (void*) & pVtxBase[22] ); // I2
                        glVertexAttribPointerARB( 4, 2, GL_FLOAT, KVANRM_FALSE, iStride, (void*) & pVtxBase[24] ); // T8
                        break;
                }
                //////////////////////////////////////////////////////////////////////
#else
                //////////////////////////////////////////////////////////////////////
                glVertexAttribPointerARB( 0, 3, GL_FLOAT, KVANRM_FALSE, iStride, (void*) & pVtxBase[0] ); // V12
                glVertexAttribPointerARB( 1, 4, GL_UNSIGNED_BYTE, KVANRM_TRUE, iStride, (void*) & pVtxBase[12] ); // C4
                glVertexAttribPointerARB( 2, 3, GL_SHORT, KVANRM_TRUE, iStride, (void*) & pVtxBase[16] ); // N6
                glVertexAttribPointerARB( 3, 1, GL_UNSIGNED_SHORT, KVANRM_FALSE, iStride, (void*) & pVtxBase[22] ); // I2
                glVertexAttribPointerARB( 4, 2, GL_FLOAT, KVANRM_FALSE, iStride, (void*) & pVtxBase[24] ); // T8
                //////////////////////////////////////////////////////////////////////
#endif
                //////////////////////////////////////////////////////////////////////
                GL_ERRORCHECK();

                break;
            }
            case EVTXSTREAMFMT_V12C4T4:
            {
                glDisableVertexAttribArrayARB( 3 );
                glDisableVertexAttribArrayARB( 4 );
                glEnableVertexAttribArrayARB( 0 );
                glEnableVertexAttribArrayARB( 1 );
                glEnableVertexAttribArrayARB( 2 );
                glVertexAttribPointerARB( 0, 3, GL_FLOAT, KVANRM_FALSE, iStride, (void*) & pVtxBase[0] ); // V12
                glVertexAttribPointerARB( 1, 4, GL_UNSIGNED_BYTE, KVANRM_TRUE, iStride, (void*) & pVtxBase[12] ); // C4
                glVertexAttribPointerARB( 2, 2, GL_SHORT, KVANRM_FALSE, iStride, (void*) & pVtxBase[16] ); // T4
                GL_ERRORCHECK();
                break;
            }
            case EVTXSTREAMFMT_V4C4:
            {
                glDisableVertexAttribArrayARB( 2 );
                glDisableVertexAttribArrayARB( 3 );
                glDisableVertexAttribArrayARB( 4 );
                glEnableVertexAttribArrayARB( 0 );
                glEnableVertexAttribArrayARB( 1 );
                glVertexAttribPointerARB( 0, 2, GL_SHORT, KVANRM_FALSE, 8, (void*) & pVtxBase[0] ); // V4
                glVertexAttribPointerARB( 1, 4, GL_UNSIGNED_BYTE, KVANRM_TRUE, 8, (void*) & pVtxBase[4] ); // C4
                GL_ERRORCHECK();
                break;
            }
            case EVTXSTREAMFMT_V4T4:
            {
                glDisableVertexAttribArrayARB( 2 );
                glDisableVertexAttribArrayARB( 3 );
                glDisableVertexAttribArrayARB( 4 );
                glEnableVertexAttribArrayARB( 0 );
                glEnableVertexAttribArrayARB( 1 );
                glVertexAttribPointerARB( 0, 2, GL_SHORT, KVANRM_FALSE, 8, (void*) & pVtxBase[0] ); // V4
                glVertexAttribPointerARB( 1, 2, GL_SHORT, KVANRM_FALSE, 8, (void*) & pVtxBase[4] ); // T4
                GL_ERRORCHECK();
                break;
            }
            default:
                break;
        }

        ////////////////////////////////////////////////////////////////////
        // draw it

        int inumpasses = mpCurMaterial->GetNumPasses();
        for( int ipass=0; ipass<inumpasses; ipass++ )
        {
            mpCurMaterial->SetupPass( ipass );

            if( iNumIDX ) switch( eTyp ) // Primitive / Indices Override
            {
                case EPRIM_LINES:
                    GL_ERRORCHECK();
                    glDrawElements( GL_LINES, iNumIDX, GL_UNSIGNED_SHORT, pIDX );
                    GL_ERRORCHECK();
                    break;
                case EPRIM_TRIANGLES:
                    GL_ERRORCHECK();
                    glDrawElements( GL_TRIANGLES, iNumIDX, GL_UNSIGNED_SHORT, pIDX );
                    miTrianglesRendered += (iNumIDX/3);
                    GL_ERRORCHECK();
                    break;
                case EPRIM_TRIANGLESTRIP:
                    GL_ERRORCHECK();
                    glDrawElements( GL_TRIANGLE_STRIP, iNumIDX, GL_UNSIGNED_SHORT, pIDX );
                    miTrianglesRendered += (iNumIDX-2);
                    GL_ERRORCHECK();
                    break;
                default:
                    // Unsupported primitive type: draw as points, then assert.
                    glDrawElements( GL_POINTS, iNumIDX, GL_UNSIGNED_SHORT, pIDX );
                    MagAssert( false );
                    break;
            }
            else switch( pVBuf->GetPrimType() )
            {
                case EPRIM_LINES:
                    GL_ERRORCHECK();
                    glDrawArrays( GL_LINES, 0, iNum );
                    GL_ERRORCHECK();
                    break;
                case EPRIM_TRIANGLES:
                    GL_ERRORCHECK();
                    glDrawArrays( GL_TRIANGLES, 0, iNum );
                    GL_ERRORCHECK();
                    break;
                default:
                    // Unsupported primitive type: draw as points, then assert.
                    glDrawArrays( GL_POINTS, 0, iNum );
                    MagAssert( false );
                    break;
            }

            PopRenderState();

        } // for( int ipass=0; ipass<inumpasses; ipass++ )
        //////////////////////////////////////////////////////

        //glPopClientAttrib();

        glBindBufferARB( GL_ARRAY_BUFFER_ARB, 0 );

#if USEVBO && USEIBO
        // Release the index buffer too. If it stayed bound, a later draw that
        // passes a client-memory index pointer would have that pointer
        // interpreted as an offset into this buffer.
        glBindBufferARB( GL_ELEMENT_ARRAY_BUFFER_ARB, 0 );
#endif

    } // if( iNum )

    GL_ERRORCHECK();

}


[This message has been edited by tweakoz (edited 11-03-2003).]

[This message has been edited by tweakoz (edited 11-03-2003).]

tweakoz
10-31-2003, 12:09 PM
I forgot to mention that all the VBOs are static (so far... ;> )

mtm

Elixer
10-31-2003, 01:10 PM
Slow compared to what? What res/depth? Full screen or window?

tweakoz
10-31-2003, 01:25 PM
Originally posted by Elixer:
Slow compared to what? What res/depth? Full screen or window?



I assumed since I was posting to the advanced forum, that anyone else would (or should)assume I have already ruled out fillrate or other unrelated issues. (I would have posted to beginners forum otherwise ;> )

The comparison (44FPS vs 8FPS) is achieved merely by altering the USEVBO define. That define is ONLY referenced in one place, which is the posted code fragment. Everything else is identical, including the visual results.

There is only one path in this code responsible for the slowdown, in that path, eStrFmt is EVTXSTREAMFMT_V12C4N6I2T8 (skinned) and eTyp is EPRIM_TRIANGLESTRIP, and VBOS are active in that case.

I am 99% sure this is driver related (52.16),
but if anyone with VBO experience on Nvidia FX class cards knows anything about this, if Im perhaps not initializing something properly, please reply... ;>

mtm

SirKnight
10-31-2003, 01:29 PM
Hmmm... I'm guessing you tried other driver versions? I haven't messed with VBOs as much as I would have liked to by now, but is it possible to get the full source so we can have a go at it?


-SirKnight

Zengar
10-31-2003, 04:07 PM
You are using index/vertex data in one buffer, right (if I got it correctly - I don't have much experience reading C++ code)? Split them in this case. There is an issue in the driver.

tweakoz
10-31-2003, 05:39 PM
Originally posted by Zengar:
You are using index/vertex data in one buffer, right (if I got it correctly - I don't have much experience reading C++ code)? Split them in this case. There is an issue in the driver.



GL's discussion board likes to 'unformat' my text, i always have a problem with it, probably makes the c++ harder to read ;<

Im not using VBO's for index buffers currently, Im using standard host memory for index buffers right now.

happy halloween everyone, im off to get drunk ;>

mtm

JanHH
10-31-2003, 05:44 PM
Don't spend your money on alcohol, but buy a new (faster) graphics card http://www.opengl.org/discussion_boards/ubb/smile.gif. Like me, I drink a lot and still use a gf4..

SirKnight
10-31-2003, 06:00 PM
GL's discussion board likes to 'unformat' my text, i always have a problem with it, probably makes the c++ harder to read ;<


Use the code tags to remedy that problem. [ code] code here [ /code]. Of course without the space after the first [.

-SirKnight

Zengar
10-31-2003, 10:02 PM
I will always have problem reading C++ code so it won't help 8)

I tried to move to C++ some time ago but after two days I was sick of it and now I'm back with my old good Delphi.

roffe
10-31-2003, 10:09 PM
Originally posted by Zengar:
I tried to move to C++ some time ago but after two days I was sick of it...



Wow, two whole days. That's almost too much. I can't see how you managed... http://www.opengl.org/discussion_boards/ubb/smile.gif

Elixer
11-01-2003, 02:04 PM
Originally posted by tweakoz:
I assumed since I was posting to the advanced forum, that anyone else would (or should)assume I have already ruled out fillrate or other unrelated issues. (I would have posted to beginners forum otherwise ;> )

The comparison (44FPS vs 8FPS) is achieved merely by altering the USEVBO define. That define is ONLY referenced in one place, which is the posted code fragment. Everything else is identical, including the visual results.

There is only one path in this code responsible for the slowdown, in that path, eStrFmt is EVTXSTREAMFMT_V12C4N6I2T8 (skinned) and eTyp is EPRIM_TRIANGLESTRIP, and VBOS are active in that case.

I am 99% sure this is driver related (52.16),
but if anyone with VBO experience on Nvidia FX class cards knows anything about this, if Im perhaps not initializing something properly, please reply... ;>

mtm

LOL... you can never tell these days, I see lots of posts from beginners in this forum, so we never know who we are talking to.

Those are the new force drivers right? Try one of the older dets, and see if it is the same. I would try now, but I switched to a 9600, and well, those drivers have issues also. With 2 lockups, 4 "recover" alerts, and 3 just black screens, makes me think I am dealing with the same old ATI rage 128 drivers. BTW, I tried cat 3.8, and then tried 3.7s. Those seem more stable, but again, they still have issues. http://www.opengl.org/discussion_boards/ubb/frown.gif

Oops, looks like I hijacked this thread. Sorry... maybe I need a drink now http://www.opengl.org/discussion_boards/ubb/smile.gif

tweakoz
11-03-2003, 10:50 AM
Originally posted by Elixer:
LOL... you can never tell these days, I see lots of posts from beginners in this forum, so we never know who we are talking to.

Yes, I can see there are still beginners posting a lot in this forum, but I think the forum should still give me (or any joe poster) the benefit of the doubt, until proven otherwise.... Once the poster has given enough info, it would be easy enough to determine if the question belongs in the beginners forum. Even then it's not that big of a deal to just ignore the question, or recommend that it be asked in the beginners forum. Or perhaps a moderator could move the question, if the forum server software supports that.

It just gets old and takes up too much time to have to specify details that should be obvious. (Luckily our game shipped already and our crunch is over so I dont mind right now...) ;>


Those are the new force drivers right? Try one of the older dets, and see if it is the same. I would try now, but I switched to a 9600, and well, those drivers have issues also. With 2 lockups, 4 "recover" alerts, and 3 just black screens, makes me think I am dealing with the same old ATI rage 128 drivers. BTW, I tried cat 3.8, and then tried 3.7s. Those seem more stable, but again, they still have issues. http://www.opengl.org/discussion_boards/ubb/frown.gif


I actually wrote the VBO code several months ago with the 44/45 series detonators, now Im on 52.16, the results were the same. I have a 9500 pro but its in my linux box at home, maybe I will have to swap them again.... Last time I checked the 9500pro didnt like VBO and it didnt like generic vertex attribs, actually that was the only reason I took that 9500 out of my work machine, otherwise I liked that card... this 5200 was more expensive and is way slower.... but the drivers seem slightly more mature. It couldnt possibly be an AGP problem, If the geometry is resident, the AGP traffic should decrease, even the non VBO path had to pass index buffers over the AGP bus.

I also wrote DX8 and DX9 renderers for this engine, although their code is out of date. My engine's architecture changes a lot and I don't always update every system... maybe I will have to try them out and see if they fare better.

I prefer GL, but occasionally DX seems to support certain features better (like resident VB's) VBO's still seem unusable to me ...

So does this mean no one has seen actual problems with the source code I have posted ?



OOps, looks like a hijacked this thread. sorry... maybe I need a drink now http://www.opengl.org/discussion_boards/ubb/smile.gif


Im still paying for this weekends halloween partying. My head hurts.....

mtm


[This message has been edited by tweakoz (edited 11-03-2003).]

[This message has been edited by tweakoz (edited 11-03-2003).]

Elixer
11-03-2003, 11:10 AM
So you can see if it is a driver issue, get this: http://www.codesampler.com/source/ogl_vertex_buffer_objects.zip

And running it...With VBO I get around 250fps, without (Vertex array) I get 30.
(this is just by running the program, and toggling VBO on/off with F1.)

I know it isn't quite what you had in mind, but it may shed a crumb of info for you.

Csiki
11-03-2003, 12:09 PM
Originally posted by tweakoz:
why is the following code so slow on a NVidia Geforce FX 5200 Ultra ?


I had the same problem with GeforceTi4200.
It seems that the new 52.16 driver does some cruel optimization with static data, but has a lot of problems with it if it's changing frequently (one change every frame)...
Use DYNAMIC_DRAW instead. Unfortunately this seems to be a simple vertex array implementation... http://www.opengl.org/discussion_boards/ubb/frown.gif

SirKnight
11-03-2003, 12:32 PM
Strange, on that demo I get 45fps on both modes. VBO didn't help at all. I have a GeForce FX 5600 Ultra, Force Ware drivers, AMD XP 2600+ and 512mb DDR w/ WinXP Pro.


-SirKnight

SirKnight
11-03-2003, 12:39 PM
Ok.....I turned vsync off and now I get 68 to 69. That's also weird as my refresh is at 85Hz. Also again, VBO did nothing to help.


-SirKnight

Elixer
11-03-2003, 01:11 PM
Originally posted by SirKnight:
Ok.....I turned vsync off and now I get 68 to 69. That's also weird as my refresh is at 85Hz. Also again, VBO did nothing to help.

-SirKnight

I forgot to say that I was using a 9600 card with cat 3.7 drivers on a 1900+ 512MB/win2k system. I guess the Nvidia drivers do have a bug since you can't get above 68.

[This message has been edited by Elixer (edited 11-03-2003).]

Adrian
11-03-2003, 01:26 PM
I have a GF5900Ultra + Dets 52.16 Win XP 2000+, I get 30 fps with vertex arrays and 210 fps with VBO. The drivers work fine for me.

SirKnight
11-03-2003, 01:39 PM
These are very interesting results. The 9600 beats the 5900U by a good amount and at the same time both of those cards benefit greatly with VBO. Yet my card does not. I guess there is a bug with the NV30 chipset and these new drivers. Strange because the NV35 really isn't that much different than the NV30s, a little bit but not anything super significant.

Maybe I'll try the older 4x.xx drivers and see what happens.


-SirKnight

SirKnight
11-03-2003, 01:44 PM
WTF! I turn FSAA to 8x from 2xQ and I get 76fps. With it at 2xQ I get almost 10 less. 2xQ is supposed to be WAY faster (with less smoothing of course) right? I also get 76 to 80 with FSAA off. Something is REALLY weird.

EDIT: Ok that is the case for some reason in that VBO earth app. In Quake 3 I lost about 150fps from 2xQ to 8x FSAA. http://www.opengl.org/discussion_boards/ubb/biggrin.gif


-SirKnight


[This message has been edited by SirKnight (edited 11-03-2003).]

tweakoz
11-03-2003, 02:36 PM
Originally posted by Csiki:
I had the same problem with GeforceTi4200.
It seems that the new 52.16 driver does some cruel optimization with static data, but has a lot of problems with it if it's changing frequently (one change every frame)...
Use DYNAMIC_DRAW instead. Unfortunately this seems to be a simple vertex array implementation... http://www.opengl.org/discussion_boards/ubb/frown.gif



I NEVER change the vertex data once initially loaded, I only change vertex program parameters (shader constants).

mtm

tweakoz
11-03-2003, 02:41 PM
Originally posted by Elixer:
So you can see if it is a driver issue, get this: http://www.codesampler.com/source/ogl_vertex_buffer_objects.zip

And running it...With VBO I get around 250fps, without (Vertex array) I get 30.
(this is just by running the program, and toggling VBO on/off with F1.)

I know it isn't quite what you had in mind, but it may shed a crumb of info for you.



I get 120 with VBO, 20 without - however that source is not using ARB_VERTEX_PROGRAM and it is not using generic vertex attributes.

I'm reasonably sure a GFFX5200Ultra has hardware ARB_VERTEX_PROGRAM support, so it shouldn't be AGP transfer screwing things up here...

mtm

[This message has been edited by tweakoz (edited 11-03-2003).]

glw
11-03-2003, 03:24 PM
In my own experience Radeons
are beating GeforceFX using
VBO with separate static
arrays, running 3.8 against 52.16.

I put all my data into VBO memory
and just call draw elements with
buffer offsets. The speed up over
system resident arrays is huge
on the Radeons (500%), and much
smaller with the GeforceFX (~100%).

Two figures I can find at the
moment ( I've a lot more but
they are on my other PC).
GeforceFX 5600 - 25.2 MTri/s
Radeon 9800 Pro - 183 MTri/s
5900 is ~ 60 MTri/s IIRC.
Those figures are from
high spec 3GHz+ P4s
running XP Pro.

To the best of my knowledge
I'm not doing anything daft,
as I've followed the spec
closely, and tried to account
for all other variations.

VBO appears to work well
on ATI, but need fixes
or optimisations on Nvidia.

SirKnight
11-03-2003, 03:34 PM
Ok I just ran the program again without changing anything and it goes from 71 to 135 fps. Strange how VBO all of a sudden makes a difference when a while ago it didn't. Still though, this seems like it should be faster.


-SirKnight

tweakoz
11-03-2003, 05:04 PM
I 'hacked' in VBO-based indices and edited the original post with the new code. This did not get me any FPS back ;<

mtm

tweakoz
11-03-2003, 05:22 PM
I should also add, with my 44FPS vs 8FPS(VBO) benchmark everything is batched reasonably. I would estimate 7 tristripped batches per frame with about 3200 indices per batch.

mtm

jwatte
11-03-2003, 08:28 PM
First: Run VTune (or another sampling profiler) on your system while the program is running, both with VBO and without it. You'll probably find a BIG spike somewhere in the VBO case. This is where you're spending all your time. Look at the code (may need disassembling) -- what is it trying to do? Packing/unpacking values? Copying data? Calculating min/max? Whatever it's doing, figure out what part of the OpenGL API would need that performed, and make it not necessary by adjusting how you call it.

For example, if it's un-packing, say, signed bytes to floating-point (and this is just as a wild example), and you're passing normals in as signed bytes, then you can draw the conclusion that this is not a supported data format, and you're better off passing normals as float.

Second issue: if you get the "VPU Recover" alerts, then it's very likely that your motherboard, memory bus, or AGP bus is not quite up to spec, and there's either a chipset bug, or a signal quality problem. Raising voltages a little bit may help if it's the latter; if it's the former, get a better mobo http://www.opengl.org/discussion_boards/ubb/smile.gif

Elixer
11-03-2003, 10:07 PM
Originally posted by jwatte:

Second issue: if you get the "VPU Recover" alerts, then it's very likely that your motherboard, memory bus, or AGP bus is not quite up to spec, and there's either a chipset bug, or a signal quality problem. Raising voltages a little bit may help if it's the latter; if it's the former, get a better mobo http://www.opengl.org/discussion_boards/ubb/smile.gif

Well, that could be, then again, I think it is driver bugs, or very picky timing for cat drivers, since those same programs I tried work just fine with my old GF2 card.

Just curious what is a 'better' mobo? I mean, what do you consider good?

Zengar
11-03-2003, 10:25 PM
According to the small demo:
60 FPS without VBO
150 FPS with VBO

FX5600 (52.16 - slightly overclocked http://www.opengl.org/discussion_boards/ubb/smile.gif ), Athlon XP 1800+ (now reborn as 2000+), nForce2, AGP 8X, fast writes enabled, sideband addressing disabled

tweakoz
11-13-2003, 10:30 AM
Originally posted by tweakoz:
why is the following code so slow on a NVidia Geforce FX 5200 Ultra ?


With NVidia's help I figured it out, and will post the results here in case anyone else runs into this:




//////////////////////////////////////////////////////////
this is fast: glVertexAttribPointerARB( 2, 3, GL_SHORT, KVANRM_FALSE, 32, (void*) 16 );
this is slow: glVertexAttribPointerARB( 2, 3, GL_SHORT, KVANRM_TRUE, 32, (void*) 16 );

slow: glVertexAttribPointerARB( 3, 1, GL_UNSIGNED_SHORT, KVANRM_FALSE, 32, (void*) 22 );
slow: glVertexAttribPointerARB( 3, 4, GL_UNSIGNED_BYTE, KVANRM_FALSE, 32, (void*) 22 ); // I2
fast: glVertexAttribPointerARB( 3, 4, GL_UNSIGNED_BYTE, KVANRM_TRUE, 32, (void*) 22 ); // I2
fast: glVertexAttribPointerARB( 3, 2, GL_UNSIGNED_BYTE, KVANRM_TRUE, 32, (void*) 22 ); // I2



So shorts and bytes are opposites as far as normalization is concerned.

here is an excerpt from an email from NVIDIA



> At first blush, my guess is that in the VBO case, the use of
> an attrib array as UNSIGNED_SHORT is causing us to fall back
> to non-pulling paths; we can't do vertex pulling with all
> types of data. I looked quickly and I can see that for
> generic attribs, SHORT, FLOAT, and HALF_FLOAT are supported
> for pulling unless the attrib is normalized and then
> UNSIGNED_BYTE is added and SHORT is removed. In fact, I don't
> think we support UNSIGNED_SHORT vertex pulling in any circumstance.
>
> When we fall back to inline methods, if the VBOs are in AGP
> memory, we read the data from there - which is very slow.
> This is likely the reason why the performance drops so much.
> As for why it speeds up in non-VBO, the data is put in system
> memory so the read to place it in the pushbuffer is fast.
>
> If the user changed the UNSIGNED_SHORT to SHORT, FLOAT, or
> HALF_FLOAT, performance should be greatly improved.



mtm

tweakoz
11-13-2003, 11:29 AM
Originally posted by tweakoz:



//////////////////////////////////////////////////////////
this is fast: glVertexAttribPointerARB( 2, 3, GL_SHORT, KVANRM_FALSE, 32, (void*) 16 );
this is slow: glVertexAttribPointerARB( 2, 3, GL_SHORT, KVANRM_TRUE, 32, (void*) 16 );

slow: glVertexAttribPointerARB( 3, 1, GL_UNSIGNED_SHORT, KVANRM_FALSE, 32, (void*) 22 );
slow: glVertexAttribPointerARB( 3, 4, GL_UNSIGNED_BYTE, KVANRM_FALSE, 32, (void*) 22 ); // I2
fast: glVertexAttribPointerARB( 3, 4, GL_UNSIGNED_BYTE, KVANRM_TRUE, 32, (void*) 22 ); // I2
fast: glVertexAttribPointerARB( 3, 2, GL_UNSIGNED_BYTE, KVANRM_TRUE, 32, (void*) 22 ); // I2



So shorts and bytes are opposites as far as normalization is concerned.

here is an excerpt from an email from NVIDIA


mtm

here is more useful info from NVIDIA.



For HALF_FLOAT usage, you should simply be able to pass in GL_HALF_FLOAT_NV
for the data type in place of GL_FLOAT. However, this will only be
accelerated on FX or better hardware - NV30 and beyond.

If you choose to use this data type, one issue you'll need to keep in mind
is that you'll need to format your data for use as half floats - you can't
simply use the same 32bit float or 16bit short data. The spec defines the
format and conversion:
http://oss.sgi.com/projects/ogl-sample/registry/NV/half_float.txt



mtm