CUDA and OpenGL, asynchronous buffering

Dear all,

I have a high-speed CCD camera from which I extract three images, do some processing on them, and visualize the result. I use CUDA with OpenGL interoperability so that both the GPU calculations and the visualization happen straight from GPU memory. The problem I'm currently facing is that I want the camera frame extractions (and the corresponding uploads to device memory) to run in parallel with the processing and visualization of the previous cycle of three frames. Is this something that can be done in a single-GPU setup?
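
Conceptually, the loop I have in mind would look something like the sketch below (this is not my current code; uploadFrameSetAsync and processAndDraw are placeholders for the camera/upload step and for my kernel-plus-draw step, and the device buffers are assumed to be double-buffered):


	cudaStream_t uploadStream, computeStream;
	cudaStreamCreate(&uploadStream);
	cudaStreamCreate(&computeStream);

	int cur = 0;                                  // frame set currently being processed
	uploadFrameSetAsync(cur, uploadStream);       // initial upload
	cudaStreamSynchronize(uploadStream);

	while (running) {
		int next = 1 - cur;
		uploadFrameSetAsync(next, uploadStream);  // extract + upload frame set N+1 ...
		processAndDraw(cur, computeStream);       // ... while frame set N is processed and drawn
		cudaStreamSynchronize(uploadStream);      // wait until the next set has arrived
		cudaStreamSynchronize(computeStream);
		cur = next;
	}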

Thanks in advance for any pointers in the right direction,

Sam

Some more info on my implementation:

Currently, my approach is similar to what is described in this CUDA-OpenGL interoperability presentation: http://www.nvidia.com/content/gtc/documents/1055_gtc09.pdf.
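
For reference, the PBO and display texture are created once at startup and the PBO is registered with CUDA. A condensed sketch of that setup (error checking omitted; the exact internal format is from memory rather than copied from my code):


	// One-time setup: create the PBO that CUDA will write into and the texture
	// it is displayed through, then register the PBO with CUDA.
	glGenBuffers(1, &pbo);
	glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo);
	glBufferData(GL_PIXEL_UNPACK_BUFFER, ncols * nrows * sizeof(float), 0, GL_STREAM_DRAW);
	glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);

	glGenTextures(1, &texid);
	glBindTexture(GL_TEXTURE_2D, texid);
	glTexImage2D(GL_TEXTURE_2D, 0, GL_LUMINANCE32F_ARB, ncols, nrows, 0, GL_LUMINANCE, GL_FLOAT, 0);
	glBindTexture(GL_TEXTURE_2D, 0);

	// CUDA only writes to the PBO, so write-discard is sufficient.
	cudaGraphicsGLRegisterBuffer(&cuda_pbo_resource, pbo, cudaGraphicsMapFlagsWriteDiscard);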

I extract three images from the camera's pipeline and transfer them to the GPU (see extract3ImagesFromCam() below). I then use


	float *d_result;
	size_t num_bytes;
	cudaGraphicsMapResources(1, &cuda_pbo_resource, 0);
	cudaGraphicsResourceGetMappedPointer((void**)&d_result, &num_bytes, cuda_pbo_resource);

to retrieve the pointer in device memory where I do my further CUDA calculations.
When ready, the texture is bound and the image is displayed:


	cudaThreadSynchronize();

	cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0);

	glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);

	glBindTexture(GL_TEXTURE_2D, texid);


	__glewBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo);
	glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, ncols, nrows, GL_LUMINANCE, GL_FLOAT,0);
	__glewBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);

My full display routine:


void display(void) 
{   
	cudaProfilerStart();

	sdkStartTimer(&timertotal);
	sdkStartTimer(&timerextractimages);

	extract3ImagesFromCam(); // here the images are extracted from the camera pipeline (see below for function)

	sdkStopTimer(&timerextractimages);
	sdkStartTimer(&timerprocessing);

	// Map the PBO registered with CUDA and get a device pointer to write the result into
	float *d_result;
	cudaGraphicsMapResources(1, &cuda_pbo_resource, 0);
	size_t num_bytes;
	cudaGraphicsResourceGetMappedPointer((void**)&d_result, &num_bytes, cuda_pbo_resource);

	launch_calculations(d_result, d_phase1, d_phase2, d_phase3, ncols, nrows, use_LUT, x_data_LUT_d, y_data_LUT_d);

	cudaThreadSynchronize();

	cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0);

	glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);

	glBindTexture(GL_TEXTURE_2D, texid);

	__glewBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo);
	glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, ncols, nrows, GL_LUMINANCE, GL_FLOAT,0);
	__glewBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);

	glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
	glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
	glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
	glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);

	__glewBindBuffer(GL_ARRAY_BUFFER, pbo);  
	__glewClientActiveTexture(GL_TEXTURE0);
	glTexCoordPointer(1, GL_FLOAT, 0, 0);
	glEnableClientState(GL_TEXTURE_COORD_ARRAY);

	__glewUseProgram(shaderProg);
	glLoadIdentity();

	glBegin(GL_QUADS);
	glTexCoord2f(0,0);
	glVertex2f(1,1);
	glTexCoord2f(1,0);
	glVertex2f(0,1);
	glTexCoord2f(1,1);
	glVertex2f(0,0);
	glTexCoord2f(0,1);
	glVertex2f(1,0);
	glEnd();

	glBindTexture(GL_TEXTURE_2D, 0);


	glutSwapBuffers();
	sdkStopTimer(&timerprocessing);
	sdkStopTimer(&timertotal);
	printf("\rTotal: %3.1f ms. Cam to GPU: %3.1f ms. Processing + displaying: %3.1f ms", sdkGetAverageTimerValue(&timertotal),sdkGetAverageTimerValue(&timerextractimages),sdkGetAverageTimerValue(&timerprocessing));
	computeFPS();
	cudaProfilerStop();
}

Extracting the images from the camera and transferring them from host (h_phasex) to device (d_phasex) memory:


void extract3ImagesFromCam()
{

	if (lDevice.IsConnected() == 1){


		PvResult lResult1 = lPipeline.RetrieveNextBuffer( &lBuffer1, 100, &lOperationResult1 );
		if ( lResult1.IsOK() )
		{
			if ( lOperationResult1.IsOK() )
			{
				// If the buffer contains an image, grab a pointer to its data

				if ( lBuffer1->GetPayloadType() == PvPayloadTypeImage )
				{		
					// Get image specific buffer interface
					PvImage *lFinalImage = lBuffer1->GetImage();
					h_phase1 = (unsigned char*)lFinalImage->GetDataPointer();			
				}
			}
			lPipeline.ReleaseBuffer( lBuffer1 );
		}

		PvResult lResult2 = lPipeline.RetrieveNextBuffer( &lBuffer2, 100, &lOperationResult2 );
		if ( lResult2.IsOK() )
		{
			if ( lOperationResult2.IsOK() )
			{
				// If the buffer contains an image, grab a pointer to its data

				if ( lBuffer2->GetPayloadType() == PvPayloadTypeImage )
				{		
					// Get image specific buffer interface
					PvImage *lFinalImage = lBuffer2->GetImage();
					h_phase2 = (unsigned char*)lFinalImage->GetDataPointer();			
				}
			}
			lPipeline.ReleaseBuffer( lBuffer2 );
		}

		PvResult lResult3 = lPipeline.RetrieveNextBuffer( &lBuffer3, 100, &lOperationResult3 );
		if ( lResult3.IsOK() )
		{
			if ( lOperationResult3.IsOK() )
			{
				// If the buffer contains an image, grab a pointer to its data

				if ( lBuffer3->GetPayloadType() == PvPayloadTypeImage )
				{		
					// Get image specific buffer interface
					PvImage *lFinalImage = lBuffer3->GetImage();
					h_phase3 = (unsigned char*)lFinalImage->GetDataPointer();			
				}
			}
			lPipeline.ReleaseBuffer( lBuffer3 );
		}

		if(!h_phase1 || !h_phase2 || !h_phase3 )
		{
			std::cout << "Could not retrieve one or more images from the camera" << std::endl;
		}

		cudaMemcpy(d_phase1, h_phase1,sizeof(unsigned char)*640*480,cudaMemcpyHostToDevice);
		cudaMemcpy(d_phase2, h_phase2,sizeof(unsigned char)*640*480,cudaMemcpyHostToDevice);
		cudaMemcpy(d_phase3, h_phase3,sizeof(unsigned char)*640*480,cudaMemcpyHostToDevice);


	}
}
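
From what I understand, for these uploads to overlap with kernel execution at all, the host buffers have to be page-locked (pinned) and the copies have to be issued with cudaMemcpyAsync in a non-default stream. A sketch of how the transfer part above might change (h_pinned1..3 and uploadStream are hypothetical names, allocated once at startup with cudaHostAlloc and cudaStreamCreate; the rest mirrors my current code):


	// Stage the camera data in pinned host buffers, then issue asynchronous
	// uploads so they can overlap with kernels running in another stream.
	memcpy(h_pinned1, h_phase1, sizeof(unsigned char)*640*480);
	memcpy(h_pinned2, h_phase2, sizeof(unsigned char)*640*480);
	memcpy(h_pinned3, h_phase3, sizeof(unsigned char)*640*480);

	cudaMemcpyAsync(d_phase1, h_pinned1, sizeof(unsigned char)*640*480, cudaMemcpyHostToDevice, uploadStream);
	cudaMemcpyAsync(d_phase2, h_pinned2, sizeof(unsigned char)*640*480, cudaMemcpyHostToDevice, uploadStream);
	cudaMemcpyAsync(d_phase3, h_pinned3, sizeof(unsigned char)*640*480, cudaMemcpyHostToDevice, uploadStream);

Presumably the d_phasex buffers would then also need to be double-buffered, so that the upload for the next cycle does not overwrite the set that launch_calculations() is still reading.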