Performance with 3 windows on two monitors - What can I be doing wrong?

I am at the end…

I need to be able to render a 1920x1080 HD video window on a second monitor and two 960x540 video windows on the first monitor, all with overlays. Each window has an independent video update rate of 30 Hz. The performance problem comes in when I add the overlays (each independently updating at 10 Hz). I’m using a double-buffered PBO scheme with glMapBufferRange() to allow for asynchronous updates of video and overlay frames. I suspect that the bottleneck has to do with the blending and/or rendering of the ABGR overlay, but I am hoping someone can point out obvious things that I am doing wrong. I’ll attach code… Note, this will likely not compile, but should be enough to give someone a good idea as to what I could be doing wrong or could do better. Thanks!

Having trouble adding the code, so I’ll try to do it in sections:



  /**
   * Thin wrapper around a POSIX mutex whose Lock() blocks until acquired.
   * Lock() and Unlock() are virtual so that subclasses can substitute a
   * different acquisition strategy while reusing mMutex.
   */
  class Mutex {
  public:
    Mutex() {
      pthread_mutex_init(&mMutex, NULL);
    }

    virtual ~Mutex() {
      pthread_mutex_destroy(&mMutex);
    }

    /**
     * Acquire the mutex, waiting as long as necessary.
     * @return true when pthread_mutex_lock() reports success
     */
    virtual bool Lock() {
      const int tStatus = pthread_mutex_lock(&mMutex);
      return tStatus == 0;
    }

    /**
     * Release the mutex; the pthread status code is not reported.
     */
    virtual void Unlock() {
      pthread_mutex_unlock(&mMutex);
    }

  protected:
    pthread_mutex_t mMutex;
  };
  
  class TryMutex : public Mutex {
  public:
    TryMutex() { }
    virtual ~TryMutex() { }
    virtual bool Lock() { return (0 == pthread_mutex_trylock(&mMutex)); }
  };
  
  /**
   * Mutex whose Lock() gives up after a fixed wait. The wait is the sum of
   * the seconds and microseconds passed to the constructor and is measured
   * against CLOCK_REALTIME.
   */
  class TimedMutex : public Mutex {
  public:
    /**
     * @param aWaitSecs whole seconds to wait for the lock
     * @param aWaitMicrosSecs additional microseconds to wait for the lock
     */
    TimedMutex(uint32_t aWaitSecs, uint64_t aWaitMicrosSecs) :
      mWaitSeconds(aWaitSecs), mWaitMicroSeconds(aWaitMicrosSecs) { }
    virtual ~TimedMutex() { }

    /**
     * Try to acquire the mutex until the configured wait has elapsed.
     * @return true when the mutex was acquired, false on timeout or error
     */
    virtual bool Lock() {
      const uint64_t kMicrosPerSecond = 1000000;
      const uint64_t kNanosPerSecond = 1000000000;

      struct timespec tTimeout;
      if (0 != clock_gettime(CLOCK_REALTIME, &tTimeout)) {
        return false;
      }

      // Move whole seconds out of the microsecond part first so the
      // nanosecond sum below stays under two seconds and cannot overflow.
      tTimeout.tv_sec += static_cast<time_t>(mWaitSeconds);
      tTimeout.tv_sec += static_cast<time_t>(mWaitMicroSeconds / kMicrosPerSecond);

      uint64_t tNanos = static_cast<uint64_t>(tTimeout.tv_nsec) +
                        (mWaitMicroSeconds % kMicrosPerSecond) * 1000;

      // pthread_mutex_timedlock() rejects tv_nsec >= 1e9 with EINVAL, so the
      // overflow has to be carried into tv_sec.
      if (tNanos >= kNanosPerSecond) {
        tTimeout.tv_sec += 1;
        tNanos -= kNanosPerSecond;
      }
      tTimeout.tv_nsec = static_cast<long>(tNanos);

      return (0 == pthread_mutex_timedlock(&mMutex, &tTimeout));
    }

  private:
    uint32_t mWaitSeconds;
    uint64_t mWaitMicroSeconds;
  };




  class GLBufferObject {
  public:
    GLBufferObject(Mutex *aMutexImpl, unsigned int aBytesPerPixel = 3) {
      mUpdated(false),
      mSizeChanged(false),
      mSize(0),
      mWidth(0),
      mHeight(0),
      mCurBuffer(0),
      mNumBuffered(0),
      mBytesPerPixel(aBytesPerPixel),
      mPixelMap(NULL),
      mFormat((3 == aBytesPerPixel) ? GL_RGB : GL_ABGR_EXT),
      mInternalFormat((3 == aBytesPerPixel) ? GL_RGB : GL_RGBA),
      mTextureId(0),
      mMutex(aMutexImpl) {
    }

    virtual ~GLBufferObject() {
      if (mMutex) {
        delete mMutex;
      }
    }
    
    void Destroy() {
      if (mMutex->Lock()) {
        if (mPixelMap) {
          glBindBuffer(GL_PIXEL_UNPACK_BUFFER, mBufferId[mCurBuffer]);
          glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
          glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
        }
        mMutex->Unlock();
      }
  
      glDeleteBuffers(2, mBufferId);
      glDeleteTextures(1, &mTextureId);
    }

    void Initialize() {
      glGenTextures(1, &mTextureId);
      glBindTexture(GL_TEXTURE_2D, mTextureId);
      glTexImage2D(GL_TEXTURE_2D, 0, mInternalFormat, GLCANVAS_MAX_WIDTH, GLCANVAS_MAX_HEIGHT, 0, mFormat, GL_UNSIGNED_BYTE, 0);
      glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
      glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
      glBindTexture(GL_TEXTURE_2D, 0);

      glGenBuffers(2, mBufferId);

      for (int i=0; i<2; ++i) {
        glBindBuffer(GL_PIXEL_UNPACK_BUFFER, mBufferId[i]);
        glBufferData(GL_PIXEL_UNPACK_BUFFER, (GLCANVAS_MAX_WIDTH * GLCANVAS_MAX_HEIGHT * mBytesPerPixel), NULL, GL_DYNAMIC_COPY);

        if (0 == i) {
          mPixelMap = glMapBufferRange(GL_PIXEL_UNPACK_BUFFER, 0, (GLCANVAS_MAX_WIDTH * GLCANVAS_MAX_HEIGHT * mBytesPerPixel), MAP_BUFFER_OPTIONS);
        }

        glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
      }
    }

    bool Swap() {
      bool tResult = false;

      if (mMutex->Lock()) {
        tResult = mUpdated;

        if (mUpdated) {
          glBindBuffer(GL_PIXEL_UNPACK_BUFFER, mBufferId[mCurBuffer]);
          glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
          glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);

          mUpdated = false;
          mPixelMap = NULL;
          mCurBuffer = !mCurBuffer;
          ++mNumBuffered;

          glBindBuffer(GL_PIXEL_UNPACK_BUFFER, mBufferId[mCurBuffer]);

          if (mNumBuffered > 1) {
            --mNumBuffered;

            glBindTexture(GL_TEXTURE_2D, mTextureId);

            if (!mSizeChanged) {
              glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, mWidth, mHeight, mFormat, GL_UNSIGNED_BYTE, 0);
            } else {
              printf("GLCanvas::GLBufferObject::Swap() - Input size changed %ux%u
", mWidth, mHeight);

              mSizeChanged = false;
              glTexImage2D(GL_TEXTURE_2D, 0, mInternalFormat, mWidth, mHeight, 0, mFormat, GL_UNSIGNED_BYTE, 0);
              glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
              glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
            }

            glBindTexture(GL_TEXTURE_2D, 0);
          }

          // Use the "orphaning" technique to help prevent overwrite/tearing
          glBufferData(GL_PIXEL_UNPACK_BUFFER, (GLCANVAS_MAX_WIDTH * GLCANVAS_MAX_HEIGHT * mBytesPerPixel), NULL, GL_DYNAMIC_COPY);
          mPixelMap = glMapBufferRange(GL_PIXEL_UNPACK_BUFFER, 0, mSize, MAP_BUFFER_OPTIONS);

          glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
        }

        mMutex->Unlock();
      } else {
        printf("GLCanvas::GLBufferObject::Swap() - Failed to acquire lock for update
");
      }

      return tResult;
    }

    bool Update(const void *aPixels, uint32_t aWidth, uint32_t aHeight) {
      bool tResult = false;

      if (mMutex->Lock()) {
        if (mPixelMap) {
          mSize = aWidth * aHeight * mBytesPerPixel;

          memcpy(mPixelMap, aPixels, mSize);

          if (mHeight != aHeight || mWidth != aWidth) {
            mSizeChanged = true;
          }

          mHeight = aHeight;
          mWidth = aWidth;
          mUpdated = true;
          tResult = true;
        } else {
          printf("GLCanvas::GLBufferObject::Update() pixels are not mapped!
");
        }
        mMutex->Unlock();
      }

      return tResult;
    }
  
    bool mUpdated;
    bool mSizeChanged;
    unsigned int mSize;
    unsigned int mWidth;
    unsigned int mHeight;
    unsigned int mCurBuffer;
    unsigned int mNumBuffered;
    unsigned int mBytesPerPixel;
    GLvoid *mPixelMap;
    GLint mFormat;
    GLint mInternalFormat;
    GLuint mTextureId;
    GLuint mBufferId[2];
    Mutex *mMutex;
  };




// Largest frame, in pixels, used to size the texture and pixel buffers.
#define GLCANVAS_MAX_WIDTH 1920
#define GLCANVAS_MAX_HEIGHT 1080

// Access flags passed to glMapBufferRange(): write-only mapping that
// invalidates the mapped range and skips GL's implicit synchronization.
#define MAP_BUFFER_OPTIONS (GL_MAP_UNSYNCHRONIZED_BIT | GL_MAP_INVALIDATE_RANGE_BIT | GL_MAP_WRITE_BIT)

#include <GL/glx.h>

class GLCanvas {
public:

  /**
   *
   */
  GLCanvas() {
  mRendering(false),
  mInitialized(false),
  mCanvasSizeChanged(true),
  mContext(NULL),
  mInputBuffer(new Mutex()),
  mOverlayBuffer(new TryMutex(), 4) {
    pthread_mutex_init(&mWaitMutex, NULL);
    pthread_cond_init(&mWaitCondition, NULL);
  }

  /**
   *
   */
  virtual ~GLCanvas() {
    pthread_cond_destroy(&mWaitCondition);
    pthread_mutex_destroy(&mWaitMutex);
  }

  /**
   * Sets the Display and Drawable parameters for the associated window
   * @param aDrawable
   */
  virtual void setDrawable(Display aDisplay, Drawable aDrawable) {

    if (0 == pthread_mutex_lock(&sMutex)) {
      mDisplay = aDisplay;
      mContext = createScreenCorrectContext(aDrawable);

      if (mContext) {
        mDrawable = aDrawable;

        if (glXIsDirect(mDisplay, mContext)) {
          printf("Direct rendering supported
");
          mRendering = true;
          mRenderThread.start();
        } else {
          printf("GLCanvas::setDrawable() - Direct rendering unsupported
");
        }
      } else {
        printf("GLCanvas::setDrawable() - Failed to create GL context!
");
      }
      
      pthread_mutex_unlock(&sMutex);
    }
  }

  /**
   * Update the video frame pixels - expected to be in RGB
   * @param aPixels
   * @param aWidth
   * @param aHeight
   */
  virtual void setFrame(const void *aPixels, int aWidth, int aHeight) {
    if (!mInputBuffer.Update(aData, tDesc.getWidth(), tDesc.getHeight())) {
      printf("GLCanvas::setFrame() - dropping frame!
");
    }
  }

  /**
   * Update the overlay pixels - expected to be in ABGR
   * @param aPixels
   * @param aWidth
   * @param aHeight
   */
  virtual void setOverlay(const void *aPixels, int aWidth, int aHeight) {
    mOverlayBuffer.Update(aPixels, aWidth, aHeight);
  }

  /**
   * Set the size of the canvas
   * @param aWidth
   * @param aHeight
   */
  virtual void setSize(int aWidth, int aHeight) {
    mCanvasWidth = aWidth;
    mCanvasHeight = aHeight;
    mCanvasSizeChanged = true;
  }

  /**
   * Set the rendering coordinates of the video within the canvas
   * @param aX0
   * @param aY0
   * @param aX1
   * @param aY1
   */
  virtual void setRenderingCoordinates(int aX0, int aY0, int aX1, int aY1) {
    mX0 = aX0;
    mY0 = aY0;
    mX1 = aX1;
    mY1 = aY1;
  }

  private:
  
  void idle() {
    mInputBuffer.Swap();
    mOverlayBuffer.Swap();
  }

  bool init() {
    if (!mInitialized) {
      glClearColor(0.0f, 0.0f, 0.0f, 0.0f);
      glClearDepth(1.0f);

      glEnable(GL_TEXTURE_2D);
      glEnable(GL_ALPHA_TEST);
      glAlphaFunc(GL_GREATER, 0.5f);

      mInputBuffer.Initialize(); // RGB
      mOverlayBuffer.Initialize(); // RGBA

      mInitialized = true;
    }

    return mInitialized;
  }

  void dispose() {
    mInputBuffer.Destroy();
    mOverlayBuffer.Destroy();
  }

  bool render() {
    bool tResult = true;
  
    /*
     * Clearing is needed because of changes between HD -> SD and canvas resize
     */
    if (mCanvasSizeChanged) {
      mCanvasSizeChanged = false;
    
      glMatrixMode(GL_PROJECTION);
      glLoadIdentity();
      glOrtho(0.0f, (GLdouble) mCanvasWidth, 0.0f, (GLdouble) mCanvasHeight, -1.0f, 1.0f);
      glViewport(0, 0, mCanvasWidth, mCanvasHeight);
//      glHint(GL_POLYGON_SMOOTH_HINT, GL_NICEST);
      glHint(GL_PERSPECTIVE_CORRECTION_HINT, GL_FASTEST);
      glMatrixMode(GL_MODELVIEW);
    }

    glClear(GL_COLOR_BUFFER_BIT);

    /*
     * Render the input video texture
     */
    glBindTexture(GL_TEXTURE_2D, mInputBuffer.mTextureId);

    if (0 == mX1 || 0 == mY1) {
      p”rintf(setRenderingCoordinates() wasn't called, defaulting to canvas size!
”);
      mX1 = mCanvasWidth;
      mY1 = mCanvasHeight;
    }

    glBegin(GL_QUADS);
      glTexCoord2i(0, 1); glVertex2i(mX0, mY0);
      glTexCoord2i(1, 1); glVertex2i(mX1, mY0);
      glTexCoord2i(1, 0); glVertex2i(mX1, mY1);
      glTexCoord2i(0, 0); glVertex2i(mX0, mY1);
    glEnd();

    glBindTexture(GL_TEXTURE_2D, 0);

    /*
     * Render the overlay
     */
    if (mOverlayBuffer.mWidth > 0) {
      glBindTexture(GL_TEXTURE_2D, mOverlayBuffer.mTextureId);
    
      glBegin(GL_QUADS);
        glTexCoord2i(0, 1); glVertex2i(0, 0);
        glTexCoord2i(1, 1); glVertex2i(mCanvasWidth, 0);
        glTexCoord2i(1, 0); glVertex2i(mCanvasWidth, mCanvasHeight);
        glTexCoord2i(0, 0); glVertex2i(0, mCanvasHeight);
      glEnd();

      glBindTexture(GL_TEXTURE_2D, 0);
    }

    glXSwapBuffers(mDisplay, mDrawable);
  
    return tResult;
  }

  uint64_t getTime() {
    struct timeval tNow;
    gettimeofday(&tNow, NULL);
    return ((uint64_t)tNow.tv_sec * 1000000 + tNow.tv_usec);
  }

  GLXContext createScreenCorrectContext(Drawable aDrawable, GLXContext aSharedCtx = NULL) {
    GLXContext tContext = NULL;
    GLint tAttributes[] = {GLX_RGBA, GLX_DEPTH_SIZE, 24, GLX_DOUBLEBUFFER, None};
    XVisualInfo *tVisualInfo = glXChooseVisual(mDisplay, XDefaultScreen(mDisplay), tAttributes);

    if (tVisualInfo) {
      tContext = glXCreateContext(mDisplay, tVisualInfo, aSharedCtx, GL_TRUE);;

      if (tContext) {
        if (XScreenCount(mDisplay) > 1) {
          int tResult = glXMakeCurrent(mDisplay, aDrawable, tContext);

          // When failure to make current, probably on the wrong screen
          if (GL_TRUE != tResult) {
            Display *tDisplay = XOpenDisplay(":0.1");

            printf("GLCanvas::createScreenCorrectContext() - glXMakeCurrent() failed on multi-screen display, trying other screen
");

            if (tDisplay) {
              // Destroy the context for the default display
              glXDestroyContext(tDisplay, tContext);
              tContext = NULL;

              // Currently only support screen 1 TI12/TI14
              tVisualInfo = glXChooseVisual(tDisplay, 1, tAttributes);

              if (tVisualInfo) {
                tContext = glXCreateContext(tDisplay, tVisualInfo, aSharedCtx, GL_TRUE);

                if (tContext) {
                  setDisplay(tDisplay);
                } else {
                  printf("GLCanvas::createScreenCorrectContext() - glXCreateContext() for screen 1 failed!
");
                }
              } else {
                printf("GLCanvas::createScreenCorrectContext() - glXChooseVisual() for screen 1 failed!
");
              }
            } else {
              printf("GLCanvas::createScreenCorrectContext() - XOpenDisplay(:0.1) failed!
");
            }
          } else {
            // Ensure to undo the successful glXMakeCurrent()
            glXMakeCurrent(mDisplay, None, NULL);
          }
        } else {
          printf("GLCanvas::createScreenCorrectContext() - detected a single screen
");
        }
      } else {
        printf("GLCanvas::createScreenCorrectContext() - glXCreateContext() failed!
");
      }
    } else {
      printf("GLCanvas::createScreenCorrectContext() - glXChooseVisual() for default screen failed!
");
    }

    return tContext;
  }

  virtual void start() {
   printf("GLCanvas::start() starting render thread...
");

    if (GL_TRUE == glXMakeCurrent(mDisplay, mDrawable, mContext)) {

      if (init()) {
        uint64_t tStop, tStart;

        while (mRendering) {
          render();

          tStart = getTime();
          idle();
          tStop = getTime();

          /*
           * Assuming a 60 Hz refresh, there should be about 16 ms of idle
           * time to update overlays and video input, report error when the idle
           * operations take more than 90% of this allowed time...
           */
          if (tStop - tStart > 14400) {
            printf("GLCanvas::start(%x) - idle() time: %lu taking more than 90%% of available time
", (int)mDrawable, tStop - tStart);
          }
        }

        dispose();
      } else {
        printf("GLCanvas::start() - Failed to initialize GL parameters
");
      }

      glXMakeCurrent(mDisplay, None, NULL);
    } else {
      printf("GLCanvas::start() - Failed to make context current
");
    }

    printf("GLCanvas::start() thread ended...
");
  }

  static pthread_mutex_t sMutex = PTHREAD_MUTEX_INITIALIZER; 
  
  Thread mRenderThread;
  bool mRendering;
  bool mInitialized;
  bool mCanvasSizeChanged;
  GLXContext mContext;
  pthread_mutex_t mWaitMutex;
  pthread_cond_t mWaitCondition;
  GLBufferObject mInputBuffer;
  GLBufferObject mOverlayBuffer;

};


Help! - I have this block of code to show what I am doing, but the forum is denying me the ability to post it…

Have you run a profiler? It will give you clues as to where you are spending your time.

I tried gDEBugger, but it didn’t really help. Can you suggest a better tool that will show timing on the OpenGL calls?

Should I assume that the code I posted is correct in regards to OpenGL usage and can’t be optimized?

The profilers I use:
https://code.google.com/p/shinyprofiler/
http://www.codersnotes.com/sleepy

To measure certain GL calls …use a performance counter. Use glFinish to isolate the GL call.

[QUOTE=Aliii;1258047]The profilers I use:
https://code.google.com/p/shinyprofiler/
http://www.codersnotes.com/sleepy

To measure certain GL calls …use a performance counter. Use glFinish to isolate the GL call.[/QUOTE]

Just noting, but I’m pretty sure that my issue is in no way CPU limited; the CPU usage of the application is < 50% of a single CPU and I have 12 cores.

Might it be that you are using VSync on those windows? Because if you do that would probably be the reason.

Yes, definitely using VSYNC on all windows and both screens. I need VSYNC to avoid video tearing. I’m not sure how VSYNC would be a problem. I try to do all CPU-based operations in the Swap() methods immediately after a glSwapBuffers(), which should give me ~16 ms to do what is needed. The problem is that sometimes the GL calls in the Swap() method take way longer, which ends up looking like jitter in the video. It is almost like the different canvases are fighting each other, but I can’t tell how/why.

If you have several windows and they all try to synchronize with the screen they will enter a race and potentially slow each other down.
Just try once without VSync and see if the problem still occurs.

[QUOTE=Cornix;1258057]If you have several windows and they all try to synchronize with the screen they will enter a race and potentially slow each other down.
Just try once without VSync and see if the problem still occurs.[/QUOTE]

So yes, the jitter/stutter is much better with VSYNC off - I get it. But what I think is being said… there is no best of both worlds with OpenGL, you can either have smooth looking video with tearing, or non-tearing video with high jitter - not smooth video without tearing.

I’m starting to think that using OpenGL to render video is a really bad idea. Should I be using something else in the Linux world, maybe VDPAU, or will I have similar issues there too?

This is not a problem with OpenGL, this is a problem with VSync in general.
This is just what it does. By definition.

Read up on VSync some more, on wikipedia or somewhere, this effect is very well documented and explained throughout the internet.

I’m not sure I agree completely. The issue might not be with OpenGL, but with how vendors implement OpenGL with their drivers. The whole idea of having a front and back buffer should be enough to prevent tearing when VSync is disabled, but the driver implementations allow for the back buffer to be modified during the buffer swap, introducing tearing. I don’t exactly know how long the refresh takes (probably depends on display HW), but assuming it is only a few milliseconds, it would make it much easier on the developer to lock out the back buffer during the transition to the front for the few milliseconds and making the VSync option a bit more transparent to the developer. Makes sense to me anyways, probably much more involved/complicated or nVidia/Intel/ATI would have already solved this…

I appreciate everyone’s help. I guess the code that I posted is mostly correct, as nobody seemed to point out incorrect OpenGL usage or better ways of doing things. I think for now, I may try to serialize the pixel uploads for the two windows on the same screen by waiting until just after the VSync, in hopes that it will correct the fighting-window issues.