lowering draw calls

I am now trying to batch my sprites together to reduce draw calls. I wrote these functions to sort my sprites from back to front based on z index, then by texture id.


/*
 * Exchange the two sprite pointers stored at *a and *b.
 * Only the pointers are swapped; the sprites they refer to are untouched.
 */
static void cg_sprite_swap_func(cg_sprite **a, cg_sprite **b) {
    cg_sprite *const first = *a;
    cg_sprite *const second = *b;

    *a = second;
    *b = first;
}

static int greater(cg_sprite*a, cg_sprite*b) {
    if(a->z_index < b->z_index) {
        return 0;
    }
    else if (a->z_index > b->z_index) {
        return 1;
    }
    if(a->texture_id > b->texture_id) {
        return 0;
    }
    else if(a->texture_id < b->texture_id) {
        return 1;
    }
    return -1;
}

/*
 * Sort an array of sprite pointers in place, using greater() as the ordering
 * predicate (z_index first, texture_id as the tie-break).
 *
 * a     - array of `count` sprite pointers; reordered in place.
 * count - number of entries in `a`. Nothing is done for a null array or for
 *         fewer than two entries.
 *
 * This is a bubble sort with an early exit: as soon as a full pass performs
 * no swap the array is sorted. Two neighbours are exchanged only when
 * greater() reports a strictly positive result, so sprites that compare
 * equal keep their relative order and do not defeat the early exit.
 */
void cg_sprite_back_2_front_tex_id(cg_sprite** a, int count) {
    if (!a || count < 2) {
        return;
    }

    for (int pass = 0; pass < count - 1; pass++) {
        int swapped = 0;

        /* After `pass` passes the last `pass` entries are already in place. */
        for (int j = 0; j < count - pass - 1; j++) {
            if (greater(a[j], a[j + 1]) > 0) {
                cg_sprite_swap_func(&a[j], &a[j + 1]);
                swapped = 1;
            }
        }

        if (!swapped) {
            break;
        }
    }
}

I am currently updating my sprites like this:


    /*
     * Per-frame CPU update: sort the sprite pointers, then for every sprite
     * build its model matrix from scale / rotation / position and transform
     * the four quad corners with it (iv0..iv3 -> ov0..ov3).
     */
    cg_sprite_back_2_front_tex_id(sprites, sc);
    for (int i = 0; i < sc; i++) {

        sp = sprites[i];
        /* NOTE(review): rot, scal and trns are passed to the vmathT3Make*
         * calls right below, and tmp is the output of the first vmathT3Mul;
         * these identity resets look redundant - confirm that the Make*
         * helpers overwrite the whole transform. */
        vmathT3MakeIdentity(&rot);
        vmathT3MakeIdentity(&scal);
        vmathT3MakeIdentity(&trns);
        vmathT3MakeIdentity(&tmp);

        vmathT3MakeScale(&scal, &sp->scale);
        vmathT3MakeRotationZYX(&rot, &sp->angl);
        vmathT3MakeTranslation(&trns, &sp->pos);
        /* NOTE(review): presumably vmathT3Mul(r, a, b) computes r = a * b.
         * If so, and vectors are multiplied on the right (as the
         * vmathM4MulV4 calls below suggest), tmp ends up as
         * trns * scal * rot, i.e. a vertex is rotated first, then scaled,
         * then translated. Confirm against the vectormath documentation and
         * the intended order. */
        vmathT3Mul(&tmp, &trns, &scal);  // tmp = trns combined with scal
        vmathT3Mul(&tmp, &tmp, &rot);    // tmp = (trns, scal) combined with rot

        /* Store the combined transform as the sprite's 4x4 model matrix. */
        vmathM4MakeFromT3(&sprites[i]->m_mat, &tmp);

        /* Fetch the quad's four input corners, then write the transformed
         * corners into ov0..ov3 using the model matrix. */
        cg_quad_getquadverts(&sp->iv0, &sp->iv1, &sp->iv2, &sp->iv3, sp->quad);
        vmathM4MulV4(&sp->ov0, &sp->m_mat, &sp->iv0);
        vmathM4MulV4(&sp->ov1, &sp->m_mat, &sp->iv1);
        vmathM4MulV4(&sp->ov2, &sp->m_mat, &sp->iv2);
        vmathM4MulV4(&sp->ov3, &sp->m_mat, &sp->iv3);
    }

after this all the sprites are sorted, updated and transformed on the cpu ready to be sent to the gpu. sc == sprite count. After all that work I fall back to rendering one at a time.


    /*
     * Draw every sprite with its own draw call. Each iteration writes the
     * sprite's four vertices into v_buff (per vertex: 3 position floats,
     * 4 colour floats, 2 texture-coordinate floats), sets GL state and
     * uniforms, uploads the vertex and index buffers and calls
     * glDrawElements once.
     */
    for (int i = 0; i < sc; i++) {
        sp = sprites[i];
        /* Restart at the beginning of v_buff: every sprite overwrites the
         * same leading 36 floats, so only one quad is fresh per draw. */
        idx = 0;
        // v0: position (x, y, z)
        v_buff[idx++] = sp->ov0.x;
        v_buff[idx++] = sp->ov0.y;
        v_buff[idx++] = sp->ov0.z;

        // v0: colour, quad->colors[0..3]
        v_buff[idx++] = sp->quad->colors[0];
        v_buff[idx++] = sp->quad->colors[1];
        v_buff[idx++] = sp->quad->colors[2];
        v_buff[idx++] = sp->quad->colors[3];

        // v0: texture coordinates, quad->tex_coords[0..1]
        v_buff[idx++] = sp->quad->tex_coords[0];
        v_buff[idx++] = sp->quad->tex_coords[1];

        // v1: position, colour [4..7], texture coordinates [2..3]
        v_buff[idx++] = sp->ov1.x;
        v_buff[idx++] = sp->ov1.y;
        v_buff[idx++] = sp->ov1.z;

        v_buff[idx++] = sp->quad->colors[4];
        v_buff[idx++] = sp->quad->colors[5];
        v_buff[idx++] = sp->quad->colors[6];
        v_buff[idx++] = sp->quad->colors[7];

        v_buff[idx++] = sp->quad->tex_coords[2];
        v_buff[idx++] = sp->quad->tex_coords[3];

        // v2: position, colour [8..11], texture coordinates [4..5]
        v_buff[idx++] = sp->ov2.x;
        v_buff[idx++] = sp->ov2.y;
        v_buff[idx++] = sp->ov2.z;

        v_buff[idx++] = sp->quad->colors[8];
        v_buff[idx++] = sp->quad->colors[9];
        v_buff[idx++] = sp->quad->colors[10];
        v_buff[idx++] = sp->quad->colors[11];

        v_buff[idx++] = sp->quad->tex_coords[4];
        v_buff[idx++] = sp->quad->tex_coords[5];

        // v3: position, colour [12..15], texture coordinates [6..7]
        v_buff[idx++] = sp->ov3.x;
        v_buff[idx++] = sp->ov3.y;
        v_buff[idx++] = sp->ov3.z;

        v_buff[idx++] = sp->quad->colors[12];
        v_buff[idx++] = sp->quad->colors[13];
        v_buff[idx++] = sp->quad->colors[14];
        v_buff[idx++] = sp->quad->colors[15];

        v_buff[idx++] = sp->quad->tex_coords[6];
        v_buff[idx++] = sp->quad->tex_coords[7];


        /* NOTE(review): the program, VAO, blend and depth settings below do
         * not depend on sp; they look hoistable out of the loop - confirm
         * nothing else changes this state between draws. */
        glUseProgram(ce_get_default_shader()->shader_program);
        glBindVertexArray(vao);

        glEnable(GL_BLEND);
        glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA);

        glEnable(GL_DEPTH_TEST);
        glDepthFunc(GL_LEQUAL);
        /* glClearDepth only specifies the value used by a later
         * glClear(GL_DEPTH_BUFFER_BIT); no clear happens in this loop. */
        glClearDepth(1.0f);

        /* Bind this sprite's texture to unit 0 and point the sampler
         * uniform at that unit. */
        glActiveTexture(GL_TEXTURE0);
        glUniform1i(tex_loc, 0);
        glBindTexture(GL_TEXTURE_2D, sp->texture_id);

        /* The camera matrices come from the default camera, not from sp. */
        cg_cam_get_matrices(&v_mat, &p_mat, &mvp_mat, ce_get_default_camera());

        /* NOTE(review): ov0..ov3 were already multiplied by m_mat on the
         * CPU. If the shader also applies the model matrix (or an MVP that
         * includes it), the transform is applied twice - confirm against
         * the shader source. */
        glUniformMatrix4fv(model_mat_loc, 1, GL_FALSE,
                           vmathM4GetData(&sp->m_mat));
        glUniformMatrix4fv(view_mat_loc, 1, GL_FALSE, vmathM4GetData(v_mat));
        glUniformMatrix4fv(proj_mat_loc, 1, GL_FALSE, vmathM4GetData(p_mat));
        glUniformMatrix4fv(mvp_matrix_loc, 1, GL_FALSE,
                           vmathM4GetData(mvp_mat));

        /* Upload vbo_size_in_bytes bytes of v_buff; only the first quad was
         * rewritten in this iteration. */
        glBindBuffer(GL_ARRAY_BUFFER, vert_buff);
        glBufferData(GL_ARRAY_BUFFER, (vbo_size_in_bytes), v_buff,
                     GL_STREAM_DRAW);

        /* i_buff is not modified anywhere in this loop, yet it is
         * re-uploaded for every sprite. */
        glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, ind_buff);
        glBufferData(GL_ELEMENT_ARRAY_BUFFER, sc * cg_sprite_get_sizeof_ind(),
                     i_buff, GL_STREAM_DRAW);

        /* NOTE(review): the second argument of glDrawElements is the number
         * of indices to draw. It is scaled by sc here although only one
         * sprite's vertices were written this iteration; presumably it
         * should be the index count of a single quad - confirm what
         * cg_sprite_get_vert_count() returns. */
        glDrawElements(GL_TRIANGLES, sc * cg_sprite_get_vert_count(),
                       GL_UNSIGNED_SHORT, 0);

        glBindVertexArray(0);
        debug_opengl("render loop");
    }

The thing is, the data is currently organized in such a way that I should be able to batch the sprites up and render them in larger groups than one at a time. The basic algorithm I have is: check whether the current sprite’s tex_id is the same as the previous texture id. If not, send the buffer to the GPU and draw with the currently bound texture id. If the texture ids are the same, keep putting data into the buffer until the texture ids are no longer the same, then render.

Is this the way that geometry batching is implemented?

Are there any examples that anyone can share?

Normally, both the attributes and the indices would be static. Transformation would be performed by the GPU. For 3D geometry, translucent surfaces would be handled by depth peeling rather than back-to-front rendering (correct sorting for 3D geometry is a hard problem). Textures would be coalesced into atlases (possibly using array textures) to avoid needing to split batches.

Essentially, you transfer the data directly from disk to the GPU, then just set “parameters” and perform draw calls each frame.

At that level, it leaves practically nothing for the CPU to do, so you have some scope to perform some of the operations on the CPU.

[QUOTE=GClements;1281215]Normally, both the attributes and the indices would be static. Transformation would be performed by the GPU. For 3D geometry, translucent surfaces would be handled by depth peeling rather than back-to-front rendering (correct sorting for 3D geometry is a hard problem). Textures would be coalesced into atlases (possibly using array textures) to avoid needing to split batches.

Essentially, you transfer the data directly from disk to the GPU, then just set “parameters” and perform draw calls each frame.

At that level, it leaves practically nothing for the CPU to do, so you have some scope to perform some of the operations on the CPU.[/QUOTE]
I might be misunderstanding something here but i’ll take it a bit at a time.

I’d like to create a floor, say OpenGLES 2.0 and OpenGL 330 as the base, and add additional features but to avoid feature creep and to limit myself from constantly adding the shiniest I stick to these versions. I am not doing 3d now, probably not for a while actually so this is for mainly 2D.

the indices don’t change so they can be uploaded once, that part I just figured out and added it to my code.
I do understand texture atlases as well and wrote a bin packer, which would immediately solve the multiple-texture problem with batching, but I would like to be able to render effectively without using atlases.

I think what you are referring to in the quote is sending just the vertex data to the gpu and doing

P * V * M * vec3

transformations on the gpu, is that correct? I see people argue about where to do the transformations, etc and yes it would probably be easier to implement on the gpu but actually writing this code on the cpu is really helping me understand what’s going on, instead of just sending numbers to the renderer.

[QUOTE=blubee;1281216]I think what you are referring to in the quote is sending just the vertex data to the gpu and doing

P * V * M * vec3

transformations on the gpu, is that correct?[/QUOTE]
Yes.

In this particular case, that’s problematic because the matrix is per-quad. While there are many ways to deal with that, most of them don’t work with OpenGL ES 2 and the ones which do are inefficient.

[QUOTE=GClements;1281218]Yes.

In this particular case, that’s problematic because the matrix is per-quad. While there are many ways to deal with that, most of them don’t work with OpenGL ES 2 and the ones which do are inefficient.[/QUOTE]

that’s why I am doing those transformations on the cpu and trying to set up a batching system for the quads. hence the original post in this thread.

Well, your case is fairly atypical, in that you seem to be assuming that all geometry can be translucent, and thus needs to be rendered in a specific order. That’s going to have a significant cost in terms of your ability to batch by texture. In the general case, the probability of two consecutive sprites sharing the same texture is 1/N (where N is the number of textures). The probability of three consecutive sprites sharing the same texture is 1/N^2. And so on. So unless the Z ordering is chosen with batching by texture in mind, you typically aren’t going to get large batches.

If you’re sorting by Z simply because geometry “might” be translucent, I’d suggest that you don’t do that. Require translucent geometry to be explicitly flagged as such so that it can be processed separately. Non-translucent geometry can be batched by texture, relying upon depth testing for ordering.

For translucent geometry, it may even be worthwhile to perform a topological sort each frame so that you can ignore the relative ordering of sprites which don’t overlap, and batch by texture instead. That would require some computation, and potentially replacing all or part of the index buffer each frame, but would reduce the number of draw calls.

[QUOTE=GClements;1281220]Well, your case is fairly atypical, in that you seem to be assuming that all geometry can be translucent, and thus needs to be rendered in a specific order. That’s going to have a significant cost in terms of your ability to batch by texture. In the general case, the probability of two consecutive sprites sharing the same texture is 1/N (where N is the number of textures). The probability of three consecutive sprites sharing the same texture is 1/N^2. And so on. So unless the Z ordering is chosen with batching by texture in mind, you typically aren’t going to get large batches.

If you’re sorting by Z simply because geometry “might” be translucent, I’d suggest that you don’t do that. Require translucent geometry to be explicitly flagged as such so that it can be processed separately. Non-translucent geometry can be batched by texture, relying upon depth testing for ordering.

For translucent geometry, it may even be worthwhile to perform a topological sort each frame so that you can ignore the relative ordering of sprites which don’t overlap, and batch by texture instead. That would require some computation, and potentially replacing all or part of the index buffer each frame, but would reduce the number of draw calls.[/QUOTE]

That’s a lot of good advice. The reason that I am sorting these sprites back to front is that it’s all 2D rendering at the moment. That means I am just rendering alpha cutout quads. Is there a better way to go about this? I know there are order-independent rendering techniques, but they require a higher OpenGL version than 2.0: they require half floats, which are only available in gles 3.0 and higher core profile.

Is there another way to look at this problem of rendering only flat quads with alpha cutout textures?

If you’re using the alpha channel as a binary mask, then you just need alpha testing, i.e. have the fragment shader “discard” the fragment if the texture’s alpha is below some threshold value (the equivalent of glEnable(GL_ALPHA_TEST) in the fixed-function pipeline). Then you can render sprites in any order (e.g. batched by texture) and use the depth buffer for ordering.

I just did a quick test with this and it works pretty well but there was one glaring thing. In the screenshots that I posted earlier there was always one image that has transparency < 1 and > 0

This technique doesn’t work for that but I think that can be remedied by using two different shaders and render all the non translucent quads with that shader although there shouldn’t be too much need for many translucent sprites or at least that gives me a different way to tackle the problem.

Sprites with actual translucency should be rendered last, sorted from back to front, testing against the depth buffer used for the opaque sprites but not writing to it.

If you use a topological sort, you have more scope for batching by texture than if you use a fixed order, at the expense of needing to recalculate the order each frame. Unless the fixed order is strongly correlated with the texture, the reduction in the number of draw calls will almost certainly outweigh the sorting overhead.

Simple alpha blending that e.g. only uses additive, subtract or multiplication, does not need to be sorted. Depending on how accurate the result has to be.