Shader floating point precision

I’m having a pretty tough problem with the floating point precision in my shader. What I want to do is, I have a framebuffer object which has a GL_UNSIGNED_SHORT texture for the color attachment. I want to do some scene processing and then, in my shader, use the fbo as an accumulator for the data that I’m processing. So, every time my condition (emulated by the “TestMap” shader below) is true, my plan is to return a result of .0001 for that particular texel, which is later aggregated with a lot of other .0001 results into my result buffer.

The problem is, as this test shows, if you change the line at the bottom to DrawStuff(0.5f) it works fine, 0.1f works fine, 0.01f works fine, .005f works fine, but below about .0023 or so it starts generating pure black textures, which tells me that for some reason something somewhere is rounding my floating points that are too small down to 0, probably because of a precision problem. That’s not cool because with an unsigned short buffer I should be getting at least 1/2^16 precision, or about .00015 and since I need to do many thousands of exposures, using a value of .001 won’t work for me.

I did this before with a floating point buffer and just returning 1 instead of .0001, but I’d rather get around the rather recent hardware requirements of floating point buffers. If there were some way (without using a more modern opengl) to access the short values of my buffer directly instead of working with just floats, I would love to do that. Or if there were some other way to do this while keeping the hardware requirements lowish, I’m all ears.

(There may be some messy bits, because I had to tear it out of my project to make it simple enough to post here.)

#include <GL/glew.h>
#include <GL/freeglut.h>

// Returns the GLSL vertex shader for the test pass: transforms the vertex,
// passes the eye-space surface normal and the vertex color to the fragment
// stage.
const char* GetVSTestMap()
{
	static const char szVertexSource[] =
		"varying vec3 vecSurfaceNormal;"
		"void main()"
		"{"
		"	vecSurfaceNormal = normalize(gl_NormalMatrix * gl_Normal);"
		"	gl_Position = gl_ModelViewProjectionMatrix * gl_Vertex;"
		"	gl_FrontColor = gl_Color;"
		"}";

	return szVertexSource;
}

// Returns the GLSL fragment shader for the test pass: writes flMaxValue for
// fragments whose surface normal faces the viewer, 0.0 for back-facing ones.
const char* GetFSTestMap()
{
	static const char szFragmentSource[] =
		"uniform float flMaxValue;"
		"varying vec3 vecSurfaceNormal;"
		"void main()"
		"{"
		"	float flShadow = flMaxValue;"
		"	if (dot(vecSurfaceNormal, vec3(0.0, 0.0, 1.0)) < 0.0)"
		"		flShadow = 0.0;"
		"	gl_FragColor = vec4(flShadow, flShadow, flShadow, flMaxValue);"
		"}";

	return szFragmentSource;
}

// Returns the GLSL vertex shader for the result-visualization pass: fixed
// function transform plus a pass-through of the first texture coordinate set.
const char* GetVSResultMap()
{
	static const char szVertexSource[] =
		"varying vec2 vecUV;"
		"void main()"
		"{"
		"	gl_Position = ftransform();"
		"	gl_FrontColor = gl_Color;"
		"	vecUV = gl_MultiTexCoord0.st;"
		"}";

	return szVertexSource;
}

// Returns the GLSL fragment shader for the result-visualization pass: samples
// the accumulation texture and outputs pure black for zero red, pure white
// otherwise, making even tiny nonzero accumulator values visible.
const char* GetFSResultMap()
{
	static const char szFragmentSource[] =
		"uniform sampler2D iResultMap;"
		"varying vec2 vecUV;"
		"void main()"
		"{"
		"	vec4 vecColor = texture2D(iResultMap, vecUV);"
		"	if (vecColor.r == 0.0)"
		"		gl_FragColor = vec4(0.0, 0.0, 0.0, 1.0);"
		"	else"
		"		gl_FragColor = vec4(1.0, 1.0, 1.0, 1.0);"
		"}";

	return szFragmentSource;
}

// Draws the given texture as a screen-aligned quad over the current draw
// buffer. flScale is the quad's half-extent in NDC (1.0 = full screen).
// All matrix stacks and the touched GL state are restored before returning.
void DrawTexture(GLuint iTexture, float flScale = 1.0f)
{
	glClear(GL_DEPTH_BUFFER_BIT);

	// Save and reset all three matrix stacks so the quad can be emitted
	// directly in normalized device coordinates.
	glMatrixMode(GL_PROJECTION);
	glPushMatrix();
	glLoadIdentity();

	glMatrixMode(GL_MODELVIEW);
	glPushMatrix();
	glLoadIdentity();

	glMatrixMode(GL_TEXTURE);
	glPushMatrix();
	glLoadIdentity();

	// Save the enable/texture/buffer state that is changed below.
	glPushAttrib(GL_COLOR_BUFFER_BIT|GL_DEPTH_BUFFER_BIT|GL_ENABLE_BIT|GL_TEXTURE_BIT);

	// Plain textured draw: no lighting, no depth test.
	glDisable(GL_LIGHTING);
	glDisable(GL_DEPTH_TEST);
	glEnable(GL_TEXTURE_2D);

	glShadeModel(GL_SMOOTH);

	glBindTexture(GL_TEXTURE_2D, iTexture);

	// White modulate color so the texture appears unmodified.
	glColor3f(1.0f, 1.0f, 1.0f);
	glBegin(GL_QUADS);
		glTexCoord2f(0.0f, 0.0f);
		glVertex2f(-flScale, -flScale);

		glTexCoord2f(0.0f, 1.0f);
		glVertex2f(-flScale, flScale);

		glTexCoord2f(1.0f, 1.0f);
		glVertex2f(flScale, flScale);

		glTexCoord2f(1.0f, 0.0f);
		glVertex2f(flScale, -flScale);
	glEnd();

	glPopAttrib();

	// Restore the matrix stacks saved above.
	glMatrixMode(GL_PROJECTION);
	glPopMatrix();

	glMatrixMode(GL_MODELVIEW);
	glPopMatrix();

	glMatrixMode(GL_TEXTURE);
	glPopMatrix();
}

// Color texture attached to the FBO; receives the test pass output.
GLuint iUVMap;
// Program built from GetVSTestMap/GetFSTestMap (writes flMaxValue per texel).
GLuint iTestProgram;
// Framebuffer object that iUVMap is attached to.
GLuint iUVFB;
// Display list containing the test scene geometry (a teapot).
GLuint iSceneList;
// Program built from GetVSResultMap/GetFSResultMap (black/white visualization).
GLuint iResultProgram;

void DrawStuff(float flMaxValue)
{
	glBindFramebufferEXT(GL_FRAMEBUFFER, iUVFB);

	glPushAttrib(GL_COLOR_BUFFER_BIT|GL_DEPTH_BUFFER_BIT|GL_ENABLE_BIT|GL_TEXTURE_BIT);

	glUseProgram(iTestProgram);

	GLuint iResultMapMaxShadow = glGetUniformLocation(iTestProgram, "flMaxValue");
	glUniform1f(iResultMapMaxShadow, flMaxValue);

	glMatrixMode(GL_MODELVIEW);
	glLoadIdentity();
	gluLookAt(
		5, 5, 10,
		0, 0, 0,
		0, 1, 0);

	glDisable(GL_DEPTH_TEST);

	glCallList(iSceneList);

	glBindTexture(GL_TEXTURE_2D, 0);
	glActiveTexture(GL_TEXTURE0);

	glUseProgram(0);

	glPopAttrib();

	glBindFramebufferEXT(GL_FRAMEBUFFER, 0);

	glBindTexture(GL_TEXTURE_2D, iUVMap);

	glDrawBuffer(GL_FRONT);
	glReadBuffer(GL_FRONT);
	glUseProgram(iResultProgram);
	GLuint iResultMapUniform = glGetUniformLocation(iResultProgram, "iResultMap");
	glUniform1i(iResultMapUniform, 0);
	DrawTexture(iUVMap);
	glUseProgram(0);
}

int main(int argc, char** argv)
{
	glutInit(&argc, argv);
	glutInitDisplayMode(GLUT_RGB | GLUT_DOUBLE | GLUT_DEPTH | GLUT_ALPHA | GLUT_MULTISAMPLE);

	glutInitWindowPosition(0, 0);
	glutInitWindowSize((int)1024, (int)1024);

	// The easy way to get a "windowless" context.
	glutCreateWindow("Precision test");

	glewInit();

	glutMainLoopEvent();

	// Tuck away our current stack so we can return to it later.
	glMatrixMode(GL_PROJECTION);
	glPushMatrix();
	glLoadIdentity();

	glMatrixMode(GL_MODELVIEW);
	glPushMatrix();
	glLoadIdentity();

	// Create a list with the required polys so it draws quicker.
	iSceneList = glGenLists(1);

	glNewList(iSceneList, GL_COMPILE);

	glutSolidTeapot(2.5);

	glEndList();

	glPushAttrib(GL_COLOR_BUFFER_BIT|GL_DEPTH_BUFFER_BIT|GL_ENABLE_BIT|GL_TEXTURE_BIT);

	// Clear red so that we can pick out later what we want when we're reading pixels.
	glClearColor(1, 0, 0, 1);

	glDisable(GL_CULL_FACE);

	GLsizei iShadowMapSize = 1024;

	iUVMap;
	glGenTextures(1, &iUVMap);
	glBindTexture(GL_TEXTURE_2D, iUVMap);
	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
	glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, (GLsizei)512, (GLsizei)512, 0, GL_RGBA, GL_UNSIGNED_SHORT, NULL);
	glBindTexture(GL_TEXTURE_2D, 0);

	GLuint iUVRB;
	glGenRenderbuffersEXT(1, &iUVRB);
	glBindRenderbufferEXT( GL_RENDERBUFFER, iUVRB );
	glRenderbufferStorageEXT( GL_RENDERBUFFER, GL_DEPTH_COMPONENT, (GLsizei)512, (GLsizei)512 );
	glBindRenderbufferEXT( GL_RENDERBUFFER, 0 );

	// A frame buffer for holding the UV layout once it is rendered flat with the shadow
	iUVFB;
	glGenFramebuffersEXT(1, &iUVFB);
	glBindFramebufferEXT(GL_FRAMEBUFFER, iUVFB);
	glFramebufferTexture2DEXT(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, iUVMap, 0);
	glFramebufferRenderbufferEXT(GL_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, GL_RENDERBUFFER, iUVRB);	// Unused
	glBindFramebufferEXT(GL_FRAMEBUFFER, 0);

	GLuint iTestVertexShader = glCreateShader(GL_VERTEX_SHADER);
	const char* pszShaderSource = GetVSTestMap();
	glShaderSource(iTestVertexShader, 1, &pszShaderSource, NULL);
	glCompileShader(iTestVertexShader);

#ifdef _DEBUG
	int iLogLength = 0;
	char szLog[1024];
	glGetShaderInfoLog(iTestVertexShader, 1024, &iLogLength, szLog);
#endif

	GLuint iTestFragmentShader = glCreateShader(GL_FRAGMENT_SHADER);
	pszShaderSource = GetFSTestMap();
	glShaderSource(iTestFragmentShader, 1, &pszShaderSource, NULL);
	glCompileShader(iTestFragmentShader);

#ifdef _DEBUG
	glGetShaderInfoLog(iTestFragmentShader, 1024, &iLogLength, szLog);
#endif

	iTestProgram = glCreateProgram();
	glAttachShader(iTestProgram, iTestVertexShader);
	glAttachShader(iTestProgram, iTestFragmentShader);
	glLinkProgram(iTestProgram);

#ifdef _DEBUG
	glGetProgramInfoLog(iTestProgram, 1024, &iLogLength, szLog);
#endif

	GLuint iResultVertexShader = glCreateShader(GL_VERTEX_SHADER);
	pszShaderSource = GetVSResultMap();
	glShaderSource(iResultVertexShader, 1, &pszShaderSource, NULL);
	glCompileShader(iResultVertexShader);

#ifdef _DEBUG
	glGetShaderInfoLog(iResultVertexShader, 1024, &iLogLength, szLog);
#endif

	GLuint iResultFragmentShader = glCreateShader(GL_FRAGMENT_SHADER);
	pszShaderSource = GetFSResultMap();
	glShaderSource(iResultFragmentShader, 1, &pszShaderSource, NULL);
	glCompileShader(iResultFragmentShader);

#ifdef _DEBUG
	glGetShaderInfoLog(iResultFragmentShader, 1024, &iLogLength, szLog);
#endif

	iResultProgram = glCreateProgram();
	glAttachShader(iResultProgram, iResultVertexShader);
	glAttachShader(iResultProgram, iResultFragmentShader);
	glLinkProgram(iResultProgram);

#ifdef _DEBUG
	glGetProgramInfoLog(iResultProgram, 1024, &iLogLength, szLog);
#endif

	float flSize = 10;	// Length of the box's diagonal

	glMatrixMode(GL_PROJECTION);
	glLoadIdentity();
	glOrtho(-flSize/2, flSize/2, -flSize/2, flSize/2, 1, flSize*2);

	while (true)
	{
		glutMainLoopEvent();

		glViewport(0, 0, 1024, 1024);
		glClear(GL_COLOR_BUFFER_BIT|GL_DEPTH_BUFFER_BIT);

		glViewport(0, 0, 512, 512);
		DrawStuff(0.01f);
		glFinish();

		glFinish();	// So I can set a breakpoint
	}
}

You made a false assumption:

glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, (GLsizei)512, (GLsizei)512, 0, GL_RGBA, GL_UNSIGNED_SHORT, NULL);

Won’t create a 16-bit-per-component texture. You left the “internalformat” parameter as just GL_RGBA, which tells OpenGL “use whatever format you think is suitable”. OpenGL will in most cases resort to GL_RGBA8 (8,8,8,8). Some years ago, some graphics cards would even choose a 16-bit internal format in this case.

Try GL_RGBA16 instead and see if it works for you.

What hardware are you running on? Older hardware won’t support shorts as an internal format - it will convert to RGBA8

Wow. I could have sworn I tried every combination of GL_RGBAXXX and GL_UNSIGNED_XXXX. I must have missed this particular one! GL_RGBA16 does seem to work, but it’s very very slow, a lot slower than the floating point buffer was. GL_RGBA16UI and GL_RGBA16I don’t seem to work at all, give me just white maps.

I’ll have to investigate this slowness, I think it’s something I can deal with. Thanks!

Yeah, maybe GL_RGBA16F would work for you? The “white-out” with GL_RGBA16UI and GL_RGBA16I can be explained by the fact that these formats are non-normalized. If you put a value of 1400 in one of those textures, you’ll get 1400 out of it when you sample it in the shader, not 1400/65535 as for GL_RGBA16. Now write a value of 1400 into a normalized framebuffer (values of 0…1 are accepted) and voilà — you get a beautifully clamped white :wink:

Well then that sounds like more what I need. I’ve done so much googling to find what the difference between the UI and non-UI versions of these textures are and how they are handled differently, but I couldn’t find that. Is there information on it somewhere? Are they extensions or something? If I can address the pixels on those formats to add a value of 1 to it directly instead of specifying .0001 and hoping that it comes out to .0001*65535=~6.5, I’ll get much better precision out of my tools.

I’m not quite sure I understand your logic about how to address the UI/I versions of the textures. Suppose I try:

	vec4 vecColor = texture2D(iMap, vecUV);

vecColor will be [0, 65535] instead of [0, 1] ? Does that require SM4? (Trying to avoid that hardware requirement.) And when I do this:

	gl_FragColor = vec4(1.0, 10.0, 100.0, 1000.0);

Are those numbers also [0,65535] ? But you say they get clamped to 1 which causes white-out… why? How can I avoid that?

As for GL_RGBA16F, that’s a floating point buffer which would work (in fact I’m using it right now) but I’m trying to replace it because I’m trying to support older hardware. I’m hoping GL_RGBA16UI doesn’t require newer hardware like SM4.

I’m still having slowdown problems, but oddly enough the texture itself is not being slow, it’s my call to glutMainLoopEvent() a little bit down the line that is stupidly slow, but only if I have a GL_RGBA16 texture loaded. When it was GL_RGBA8 or GL_RGBA16F I never had any problems. When it is GL_RGBA16UI it works very fast as well, it seems that only GL_RGBA16 is causing glutMainLoopEvent() to slow down. So, I’d like to see if I can get the UI version working properly and thus avoid having to figure out this slowdown :stuck_out_tongue:

vecColor will be [0, 65535] instead of [0, 1] ?

No. This will simply fail to compile.

“texture” samples from floating-point or normalized integer texture formats. If “iMap” is an integral format (I or UI), then the attempt to render will fail.

You need to specifically state that you’re sampling from an integral format. If it’s a signed integral texture like GL_RGB8I, you have to use “itexture”, which returns an “ivec4”. Likewise, if you sample from GL_RGB8UI, you must use “utexture”, which returns a “uvec4”.

See the Wiki page on Samplers for more details.

Does that require SM4? (Trying to avoid that hardware requirement.)

Integral texture formats (I/UI) require “SM4”. That is, they require OpenGL 3.x-class hardware. So yes, the texture functions and samplers to access these texture require this.

Are those numbers also [0,65535] ?

It rather depends; unlike texture access, your shader alone cannot control this. It is controlled by the image format of your output buffer and the clamp modes you have set. If you are using a normalized integer format, then they will be clamped.

If you are using an integral or float format, you need to set the clamp write mode to off. I’ve forgotten what function sets this; you can find it in the spec somewhere.

As for GL_RGBA16F, that’s a floating point buffer which would work (in fact I’m using it right now) but I’m trying to replace it because I’m trying to support older hardware. I’m hoping GL_RGBA16UI doesn’t require newer hardware like SM4.

GL_RGBA16F is supported in anything even remotely recent. GL 2.x (ie: D3D 9) supports this. Integral textures require GL 3.x hardware. So you’re doing it backwards if you want to support more hardware.

Hmm. Yes it appears that the UI/I type textures are too recent then to support what I need. I have an old GeForce FX 5200 that I’m trying to get it working on, and it definitely does not do OpenGL 3.x, and won’t run shader version 1.4 stuff. Looks like I’ll have to figure out why glutMainLoopEvent() is so damn slow.