/**
 * Copyright (c) 2020-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

/**
 * There are two threads in the optimized code. input thread and Processing thread.
 * The pre-procesing as required by the algorithm like scaling and color
 * conversion of data is done in input thread. This is done using NvBufSurfTransform's
 * batch conversion APIs to improve performance. The processing of data using custom
 * algorithm and parsing the output and  metadata attachment is done in separate processing
 * thread.
 *
 * There are two queues used for buffering and transferring data between thread:
 * Process_queue and buf_queue Process_queue is used to send filled batched data to
 * process thread and buf_queue is used to get return empty processed buffers from
 * process thread to input thread.  Two buffers are used in a ping pong manner between
 * the two threads for parallel processing.
 */

#include <string.h>
#include <string>
#include <sstream>
#include <iostream>
#include <ostream>
#include <fstream>

#include "gstdsexample_optimized.h"

#include <sys/time.h>
#include <condition_variable>
#include <mutex>
#include <thread>

GST_DEBUG_CATEGORY_STATIC (gst_dsexample_debug);
#define GST_CAT_DEFAULT gst_dsexample_debug
#define USE_EGLIMAGE 1

#ifdef WITH_OPENCV
//enable to write transformed cvmat to files
//#define DSEXAMPLE_DEBUG
#ifdef DSEXAMPLE_DEBUG
#include "opencv2/imgcodecs.hpp"
#endif
#endif

static GQuark _dsmeta_quark = 0;

/* Enum to identify properties */
enum
{
  PROP_0,
  PROP_UNIQUE_ID,
  PROP_PROCESSING_WIDTH,
  PROP_PROCESSING_HEIGHT,
  PROP_PROCESS_FULL_FRAME,
  PROP_BATCH_SIZE,
  PROP_GPU_DEVICE_ID
};

#define CHECK_NVDS_MEMORY_AND_GPUID(object, surface)  \
  ({ int _errtype=0;\
   do {  \
    if ((surface->memType == NVBUF_MEM_DEFAULT || surface->memType == NVBUF_MEM_CUDA_DEVICE) && \
        (surface->gpuId != object->gpu_id))  { \
    GST_ELEMENT_ERROR (object, RESOURCE, FAILED, \
        ("Input surface gpu-id doesnt match with configured gpu-id for element," \
         " please allocate input using unified memory, or use same gpu-ids"),\
        ("surface-gpu-id=%d,%s-gpu-id=%d",surface->gpuId,GST_ELEMENT_NAME(object),\
         object->gpu_id)); \
    _errtype = 1;\
    } \
    } while(0); \
    _errtype; \
  })


/* Default values for properties */
#define DEFAULT_UNIQUE_ID 15
#define DEFAULT_PROCESSING_WIDTH 640
#define DEFAULT_PROCESSING_HEIGHT 480
#define DEFAULT_PROCESS_FULL_FRAME TRUE
#define DEFAULT_GPU_ID 0
#define DEFAULT_BATCH_SIZE 1

#define RGB_BYTES_PER_PIXEL 3
#define RGBA_BYTES_PER_PIXEL 4
#define Y_BYTES_PER_PIXEL 1
#define UV_BYTES_PER_PIXEL 2

#define MIN_INPUT_OBJECT_WIDTH 16
#define MIN_INPUT_OBJECT_HEIGHT 16

#define CHECK_NPP_STATUS(npp_status,error_str) do { \
  if ((npp_status) != NPP_SUCCESS) { \
    g_print ("Error: %s in %s at line %d: NPP Error %d\n", \
        error_str, __FILE__, __LINE__, npp_status); \
    goto error; \
  } \
} while (0)

#define CHECK_CUDA_STATUS(cuda_status,error_str) do { \
  if ((cuda_status) != cudaSuccess) { \
    g_print ("Error: %s in %s at line %d (%s)\n", \
        error_str, __FILE__, __LINE__, cudaGetErrorName(cuda_status)); \
    goto error; \
  } \
} while (0)

/* By default NVIDIA Hardware allocated memory flows through the pipeline. We
 * will be processing on this type of memory only. */
#define GST_CAPS_FEATURE_MEMORY_NVMM "memory:NVMM"
static GstStaticPadTemplate gst_dsexample_sink_template =
GST_STATIC_PAD_TEMPLATE ("sink",
    GST_PAD_SINK,
    GST_PAD_ALWAYS,
    GST_STATIC_CAPS (GST_VIDEO_CAPS_MAKE_WITH_FEATURES
        (GST_CAPS_FEATURE_MEMORY_NVMM,
            "{ NV12, RGBA, I420 }")));

static GstStaticPadTemplate gst_dsexample_src_template =
GST_STATIC_PAD_TEMPLATE ("src",
    GST_PAD_SRC,
    GST_PAD_ALWAYS,
    GST_STATIC_CAPS (GST_VIDEO_CAPS_MAKE_WITH_FEATURES
        (GST_CAPS_FEATURE_MEMORY_NVMM,
            "{ NV12, RGBA, I420 }")));

/* Define our element type. Standard GObject/GStreamer boilerplate stuff */
#define gst_dsexample_parent_class parent_class
G_DEFINE_TYPE (GstDsExample, gst_dsexample, GST_TYPE_BASE_TRANSFORM);

static void gst_dsexample_set_property (GObject * object, guint prop_id,
    const GValue * value, GParamSpec * pspec);
static void gst_dsexample_get_property (GObject * object, guint prop_id,
    GValue * value, GParamSpec * pspec);

static gboolean gst_dsexample_set_caps (GstBaseTransform * btrans,
    GstCaps * incaps, GstCaps * outcaps);
static gboolean gst_dsexample_start (GstBaseTransform * btrans);
static gboolean gst_dsexample_stop (GstBaseTransform * btrans);

static GstFlowReturn
gst_dsexample_submit_input_buffer (GstBaseTransform * btrans,
    gboolean discont, GstBuffer * inbuf);
static GstFlowReturn
gst_dsexample_generate_output (GstBaseTransform * btrans, GstBuffer ** outbuf);

static void
attach_metadata_full_frame (GstDsExample * dsexample,
    NvDsFrameMeta * frame_meta, gdouble scale_ratio, DsExampleOutput * output,
    guint batch_id);
static void attach_metadata_object (GstDsExample * dsexample,
    NvDsObjectMeta * obj_meta, DsExampleOutput * output);

static gpointer gst_dsexample_output_loop (gpointer data);

/* Install properties, set sink and src pad capabilities, override the required
 * functions of the base class, These are common to all instances of the
 * element.
 */
static void
gst_dsexample_class_init (GstDsExampleClass * klass)
{
  GObjectClass *gobject_class;
  GstElementClass *gstelement_class;
  GstBaseTransformClass *gstbasetransform_class;

  // Indicates we want to use DS buf api
  g_setenv ("DS_NEW_BUFAPI", "1", TRUE);

  gobject_class = (GObjectClass *) klass;
  gstelement_class = (GstElementClass *) klass;
  gstbasetransform_class = (GstBaseTransformClass *) klass;

  /* Overide base class functions */
  gobject_class->set_property = GST_DEBUG_FUNCPTR (gst_dsexample_set_property);
  gobject_class->get_property = GST_DEBUG_FUNCPTR (gst_dsexample_get_property);

  gstbasetransform_class->set_caps = GST_DEBUG_FUNCPTR (gst_dsexample_set_caps);
  gstbasetransform_class->start = GST_DEBUG_FUNCPTR (gst_dsexample_start);
  gstbasetransform_class->stop = GST_DEBUG_FUNCPTR (gst_dsexample_stop);

  gstbasetransform_class->submit_input_buffer =
      GST_DEBUG_FUNCPTR (gst_dsexample_submit_input_buffer);
  gstbasetransform_class->generate_output =
      GST_DEBUG_FUNCPTR (gst_dsexample_generate_output);

  /* Install properties */
  g_object_class_install_property (gobject_class, PROP_UNIQUE_ID,
      g_param_spec_uint ("unique-id",
          "Unique ID",
          "Unique ID for the element. Can be used to identify output of the"
          " element", 0, G_MAXUINT, DEFAULT_UNIQUE_ID, (GParamFlags)
          (G_PARAM_READWRITE | G_PARAM_STATIC_STRINGS)));

  g_object_class_install_property (gobject_class, PROP_PROCESSING_WIDTH,
      g_param_spec_int ("processing-width",
          "Processing Width",
          "Width of the input buffer to algorithm",
          1, G_MAXINT, DEFAULT_PROCESSING_WIDTH, (GParamFlags)
          (G_PARAM_READWRITE | G_PARAM_STATIC_STRINGS)));

  g_object_class_install_property (gobject_class, PROP_PROCESSING_HEIGHT,
      g_param_spec_int ("processing-height",
          "Processing Height",
          "Height of the input buffer to algorithm",
          1, G_MAXINT, DEFAULT_PROCESSING_HEIGHT, (GParamFlags)
          (G_PARAM_READWRITE | G_PARAM_STATIC_STRINGS)));

  g_object_class_install_property (gobject_class, PROP_PROCESS_FULL_FRAME,
      g_param_spec_boolean ("full-frame",
          "Full frame",
          "Enable to process full frame or disable to process objects detected"
          "by primary detector", DEFAULT_PROCESS_FULL_FRAME, (GParamFlags)
          (G_PARAM_READWRITE | G_PARAM_STATIC_STRINGS)));

  g_object_class_install_property (gobject_class, PROP_BATCH_SIZE,
      g_param_spec_uint ("batch-size", "Batch Size",
          "Maximum batch size for processing",
          1, NVDSEXAMPLE_MAX_BATCH_SIZE, DEFAULT_BATCH_SIZE,
          (GParamFlags) (G_PARAM_READWRITE | G_PARAM_STATIC_STRINGS |
              GST_PARAM_MUTABLE_READY)));

  g_object_class_install_property (gobject_class, PROP_GPU_DEVICE_ID,
      g_param_spec_uint ("gpu-id",
          "Set GPU Device ID",
          "Set GPU Device ID", 0,
          G_MAXUINT, 0,
          GParamFlags
          (G_PARAM_READWRITE |
              G_PARAM_STATIC_STRINGS | GST_PARAM_MUTABLE_READY)));
  /* Set sink and src pad capabilities */
  gst_element_class_add_pad_template (gstelement_class,
      gst_static_pad_template_get (&gst_dsexample_src_template));
  gst_element_class_add_pad_template (gstelement_class,
      gst_static_pad_template_get (&gst_dsexample_sink_template));

  /* Set metadata describing the element */
  gst_element_class_set_details_simple (gstelement_class,
      "DsExample plugin",
      "DsExample Plugin",
      "Process a 3rdparty example algorithm on objects / full frame",
      "NVIDIA Corporation. Post on Deepstream for Tesla forum for any queries "
      "@ https://devtalk.nvidia.com/default/board/209/");
}

static void
gst_dsexample_init (GstDsExample * dsexample)
{
  GstBaseTransform *btrans = GST_BASE_TRANSFORM (dsexample);

  /* We will not be generating a new buffer. Just adding / updating
   * metadata. */
  gst_base_transform_set_in_place (GST_BASE_TRANSFORM (btrans), TRUE);
  /* We do not want to change the input caps. Set to passthrough. transform_ip
   * is still called. */
  gst_base_transform_set_passthrough (GST_BASE_TRANSFORM (btrans), TRUE);

  /* Initialize all property variables to default values */
  dsexample->unique_id = DEFAULT_UNIQUE_ID;
  dsexample->processing_width = DEFAULT_PROCESSING_WIDTH;
  dsexample->processing_height = DEFAULT_PROCESSING_HEIGHT;
  dsexample->process_full_frame = DEFAULT_PROCESS_FULL_FRAME;
  dsexample->gpu_id = DEFAULT_GPU_ID;
  dsexample->max_batch_size = DEFAULT_BATCH_SIZE;
  /* This quark is required to identify NvDsMeta when iterating through
   * the buffer metadatas */
  if (!_dsmeta_quark)
    _dsmeta_quark = g_quark_from_static_string (NVDS_META_STRING);
}

/* Function called when a property of the element is set. Standard boilerplate.
 */
static void
gst_dsexample_set_property (GObject * object, guint prop_id,
    const GValue * value, GParamSpec * pspec)
{
  GstDsExample *dsexample = GST_DSEXAMPLE (object);
  switch (prop_id) {
    case PROP_UNIQUE_ID:
      dsexample->unique_id = g_value_get_uint (value);
      break;
    case PROP_PROCESSING_WIDTH:
      dsexample->processing_width = g_value_get_int (value);
      break;
    case PROP_PROCESSING_HEIGHT:
      dsexample->processing_height = g_value_get_int (value);
      break;
    case PROP_PROCESS_FULL_FRAME:
      dsexample->process_full_frame = g_value_get_boolean (value);
      break;
    case PROP_GPU_DEVICE_ID:
      dsexample->gpu_id = g_value_get_uint (value);
      break;
    case PROP_BATCH_SIZE:
      dsexample->max_batch_size = g_value_get_uint (value);
      break;
    default:
      G_OBJECT_WARN_INVALID_PROPERTY_ID (object, prop_id, pspec);
      break;
  }
}

/* Function called when a property of the element is requested. Standard
 * boilerplate.
 */
static void
gst_dsexample_get_property (GObject * object, guint prop_id,
    GValue * value, GParamSpec * pspec)
{
  GstDsExample *dsexample = GST_DSEXAMPLE (object);

  switch (prop_id) {
    case PROP_UNIQUE_ID:
      g_value_set_uint (value, dsexample->unique_id);
      break;
    case PROP_PROCESSING_WIDTH:
      g_value_set_int (value, dsexample->processing_width);
      break;
    case PROP_PROCESSING_HEIGHT:
      g_value_set_int (value, dsexample->processing_height);
      break;
    case PROP_PROCESS_FULL_FRAME:
      g_value_set_boolean (value, dsexample->process_full_frame);
      break;
    case PROP_GPU_DEVICE_ID:
      g_value_set_uint (value, dsexample->gpu_id);
      break;
    case PROP_BATCH_SIZE:
      g_value_set_uint (value, dsexample->max_batch_size);
      break;
    default:
      G_OBJECT_WARN_INVALID_PROPERTY_ID (object, prop_id, pspec);
      break;
  }
}

/**
 * Initialize all resources and start the process thread
 */
static gboolean
gst_dsexample_start (GstBaseTransform * btrans)
{
  GstDsExample *dsexample = GST_DSEXAMPLE (btrans);
  std::string nvtx_str;
#ifdef WITH_OPENCV
  // OpenCV mat containing RGB data
  cv::Mat * cvmat;
#else
  NvBufSurface * inter_buf;
#endif
  NvBufSurfaceCreateParams create_params;
  DsExampleInitParams init_params =
      { dsexample->processing_width, dsexample->processing_height,
    dsexample->process_full_frame };

  /* Algorithm specific initializations and resource allocation. */
  dsexample->dsexamplelib_ctx = DsExampleCtxInit (&init_params);

  GST_DEBUG_OBJECT (dsexample, "ctx lib %p \n", dsexample->dsexamplelib_ctx);

  nvtx_str = "GstNvDsExample: UID=" + std::to_string(dsexample->unique_id);
  auto nvtx_deleter = [](nvtxDomainHandle_t d) { nvtxDomainDestroy (d); };
  std::unique_ptr<nvtxDomainRegistration, decltype(nvtx_deleter)> nvtx_domain_ptr (
      nvtxDomainCreate(nvtx_str.c_str()), nvtx_deleter);

  CHECK_CUDA_STATUS (cudaSetDevice (dsexample->gpu_id),
      "Unable to set cuda device");

  CHECK_CUDA_STATUS (cudaStreamCreate (&dsexample->cuda_stream),
      "Could not create cuda stream");

#ifdef WITH_OPENCV
  if (dsexample->inter_buf)
    NvBufSurfaceDestroy (dsexample->inter_buf);
  dsexample->inter_buf = NULL;
#endif

  /* An intermediate buffer for NV12/RGBA to BGR conversion  will be
   * required. Can be skipped if custom algorithm can work directly on NV12/RGBA. */
  create_params.gpuId = dsexample->gpu_id;
  create_params.width = dsexample->processing_width;
  create_params.height = dsexample->processing_height;
  create_params.size = 0;
  create_params.colorFormat = NVBUF_COLOR_FORMAT_RGBA;
  create_params.layout = NVBUF_LAYOUT_PITCH;
#ifdef __aarch64__
  create_params.memType = NVBUF_MEM_DEFAULT;
#else
  create_params.memType = NVBUF_MEM_CUDA_UNIFIED;
#endif

#ifdef WITH_OPENCV
  if (NvBufSurfaceCreate (&dsexample->inter_buf, dsexample->max_batch_size,
          &create_params) != 0) {
    GST_ERROR ("Error: Could not allocate internal buffer for dsexample");
    goto error;
  }
#endif

  /* Create process queue and cvmat queue to transfer data between threads.
   * We will be using this queue to maintain the list of frames/objects
   * currently given to the algorithm for processing. */
  dsexample->process_queue = g_queue_new ();
  dsexample->buf_queue = g_queue_new ();

#ifdef WITH_OPENCV
  /* Push cvmat buffer twice on the buf_queue which will handle the
   * different processing speed between input thread and process thread
   * cvmat queue is used for getting processed data from the process thread*/
  for (int i = 0; i < 2; i++) {
    // CV Mat containing interleaved RGB data.
    cvmat = new cv::Mat[dsexample->max_batch_size];

    for (guint j = 0; j < dsexample->max_batch_size; j++) {
      cvmat[j] =
          cv::Mat (dsexample->processing_height, dsexample->processing_width,
          CV_8UC3);
    }

    if (!cvmat)
      goto error;

    g_queue_push_tail (dsexample->buf_queue, cvmat);
  }

  GST_DEBUG_OBJECT (dsexample, "created CV Mat\n");
#else
  for (int i = 0; i < 2; i++) {
    if (NvBufSurfaceCreate (&inter_buf, dsexample->max_batch_size,
          &create_params) != 0) {
      GST_ERROR ("Error: Could not allocate internal buffer for dsexample");
      goto error;
    }

    g_queue_push_tail (dsexample->buf_queue, inter_buf);
  }
#endif

  /* Set the NvBufSurfTransform config parameters. */
  dsexample->transform_config_params.compute_mode =
      NvBufSurfTransformCompute_Default;
  dsexample->transform_config_params.gpu_id = dsexample->gpu_id;

  /* Create the intermediate NvBufSurface structure for holding an array of input
   * NvBufSurfaceParams for batched transforms. */
  dsexample->batch_insurf.surfaceList =
      new NvBufSurfaceParams[dsexample->max_batch_size];
  dsexample->batch_insurf.batchSize = dsexample->max_batch_size;
  dsexample->batch_insurf.gpuId = dsexample->gpu_id;

  /* Set up the NvBufSurfTransformParams structure for batched transforms. */
  dsexample->transform_params.src_rect =
      new NvBufSurfTransformRect[dsexample->max_batch_size];
  dsexample->transform_params.dst_rect =
      new NvBufSurfTransformRect[dsexample->max_batch_size];
  dsexample->transform_params.transform_flag =
      NVBUFSURF_TRANSFORM_FILTER | NVBUFSURF_TRANSFORM_CROP_SRC |
      NVBUFSURF_TRANSFORM_CROP_DST;
  dsexample->transform_params.transform_flip = NvBufSurfTransform_None;
  dsexample->transform_params.transform_filter =
      NvBufSurfTransformInter_Default;

  /* Start a thread which will pop output from the algorithm, form NvDsMeta and
   * push buffers to the next element. */
  dsexample->process_thread =
      g_thread_new ("dsexample-process-thread", gst_dsexample_output_loop,
      dsexample);

  dsexample->nvtx_domain = nvtx_domain_ptr.release ();

  return TRUE;
error:

  delete[]dsexample->transform_params.src_rect;
  delete[]dsexample->transform_params.dst_rect;
  delete[]dsexample->batch_insurf.surfaceList;

  if (dsexample->cuda_stream) {
    cudaStreamDestroy (dsexample->cuda_stream);
    dsexample->cuda_stream = NULL;
  }
  if (dsexample->dsexamplelib_ctx)
    DsExampleCtxDeinit (dsexample->dsexamplelib_ctx);
  return FALSE;
}

/**
 * Stop the process thread and free up all the resources
 */
static gboolean
gst_dsexample_stop (GstBaseTransform * btrans)
{
  GstDsExample *dsexample = GST_DSEXAMPLE (btrans);

#ifdef WITH_OPENCV
  cv::Mat * cvmat;
#else
  NvBufSurface * inter_buf;
#endif

  g_mutex_lock (&dsexample->process_lock);

  /* Wait till all the items in the queue are handled. */
  while (!g_queue_is_empty (dsexample->process_queue)) {
    g_cond_wait (&dsexample->process_cond, &dsexample->process_lock);
  }

#ifdef WITH_OPENCV
  while (!g_queue_is_empty (dsexample->buf_queue)) {
    cvmat = (cv::Mat *) g_queue_pop_head (dsexample->buf_queue);
    delete[]cvmat;
    cvmat = NULL;
  }
#else
  while (!g_queue_is_empty (dsexample->buf_queue)) {
    inter_buf = (NvBufSurface *) g_queue_pop_head (dsexample->buf_queue);
    if (inter_buf)
      NvBufSurfaceDestroy (inter_buf);
    inter_buf = NULL;
  }
#endif
  dsexample->stop = TRUE;

  g_cond_broadcast (&dsexample->process_cond);
  g_mutex_unlock (&dsexample->process_lock);

  g_thread_join (dsexample->process_thread);

#ifdef WITH_OPENCV
  if (dsexample->inter_buf)
    NvBufSurfaceDestroy (dsexample->inter_buf);
  dsexample->inter_buf = NULL;
#endif

  if (dsexample->cuda_stream)
    cudaStreamDestroy (dsexample->cuda_stream);
  dsexample->cuda_stream = NULL;

  delete[]dsexample->transform_params.src_rect;
  delete[]dsexample->transform_params.dst_rect;
  delete[]dsexample->batch_insurf.surfaceList;

#ifdef WITH_OPENCV
  GST_DEBUG_OBJECT (dsexample, "deleted CV Mat \n");
#endif

  // Deinit the algorithm library
  DsExampleCtxDeinit (dsexample->dsexamplelib_ctx);
  dsexample->dsexamplelib_ctx = NULL;

  GST_DEBUG_OBJECT (dsexample, "ctx lib released \n");

  g_queue_free (dsexample->process_queue);

  g_queue_free (dsexample->buf_queue);

  return TRUE;
}

/**
 * Called when source / sink pad capabilities have been negotiated.
 */
static gboolean
gst_dsexample_set_caps (GstBaseTransform * btrans, GstCaps * incaps,
    GstCaps * outcaps)
{
  GstDsExample *dsexample = GST_DSEXAMPLE (btrans);
  /* Save the input video information, since this will be required later. */
  gst_video_info_from_caps (&dsexample->video_info, incaps);

  CHECK_CUDA_STATUS (cudaSetDevice (dsexample->gpu_id),
      "Unable to set cuda device");

  return TRUE;

error:
  return FALSE;
}

/**
 * Scale the entire frame to the processing resolution maintaining aspect ratio.
 * Or crop and scale objects to the processing resolution maintaining the aspect
 * ratio and fills data for batched conversation */
static GstFlowReturn
scale_and_fill_data(GstDsExample * dsexample,
    NvBufSurfaceParams * src_frame, NvOSD_RectParams * crop_rect_params,
    gdouble & ratio, gint input_width, gint input_height)
{

  gint src_left = GST_ROUND_UP_2((unsigned int)crop_rect_params->left);
  gint src_top = GST_ROUND_UP_2((unsigned int)crop_rect_params->top);
  gint src_width = GST_ROUND_DOWN_2((unsigned int)crop_rect_params->width);
  gint src_height = GST_ROUND_DOWN_2((unsigned int)crop_rect_params->height);

  // Maintain aspect ratio
  double hdest = dsexample->processing_width * src_height / (double) src_width;
  double wdest = dsexample->processing_height * src_width / (double) src_height;
  guint dest_width, dest_height;

  if (hdest <= dsexample->processing_height) {
    dest_width = dsexample->processing_width;
    dest_height = hdest;
  } else {
    dest_width = wdest;
    dest_height = dsexample->processing_height;
  }

  // Calculate scaling ratio while maintaining aspect ratio
  ratio = MIN (1.0 * dest_width / src_width, 1.0 * dest_height / src_height);

  if ((crop_rect_params->width == 0) || (crop_rect_params->height == 0)) {
    GST_ELEMENT_ERROR (dsexample, STREAM, FAILED,
        ("%s:crop_rect_params dimensions are zero", __func__), (NULL));
    return GST_FLOW_ERROR;
  }
#ifdef __aarch64__
  if (ratio <= 1.0 / 16 || ratio >= 16.0) {
    // Currently cannot scale by ratio > 16 or < 1/16 for Jetson
    return GST_FLOW_ERROR;
  }
#endif

  /* We will first convert only the Region of Interest (the entire frame or the
   * object bounding box) to RGB and then scale the converted RGB frame to
   * processing resolution. */
  GST_DEBUG_OBJECT (dsexample, "Scaling and converting input buffer\n");

  /* Create temporary src and dest surfaces for NvBufSurfTransform API. */
  dsexample->batch_insurf.surfaceList[dsexample->batch_insurf.numFilled] = *src_frame;

  /* Set the source ROI. Could be entire frame or an object. */
  dsexample->transform_params.src_rect[dsexample->batch_insurf.numFilled] = {
  (guint) src_top, (guint) src_left, (guint) src_width, (guint) src_height};
  /* Set the dest ROI. Could be the entire destination frame or part of it to
   * maintain aspect ratio. */
  dsexample->transform_params.dst_rect[dsexample->batch_insurf.numFilled] = {
  0, 0, dest_width, dest_height};

  dsexample->batch_insurf.numFilled++;

  return GST_FLOW_OK;
}

static gboolean
convert_batch_and_push_to_process_thread (GstDsExample * dsexample,
    GstDsExampleBatch * batch)
{

  NvBufSurfTransform_Error err;
  NvBufSurfTransformConfigParams transform_config_params;
  std::string nvtx_str;
#ifdef WITH_OPENCV
  cv::Mat in_mat;
#endif

  // Configure transform session parameters for the transformation
  transform_config_params.compute_mode = NvBufSurfTransformCompute_Default;
  transform_config_params.gpu_id = dsexample->gpu_id;
  transform_config_params.cuda_stream = dsexample->cuda_stream;

  err = NvBufSurfTransformSetSessionParams (&transform_config_params);
  if (err != NvBufSurfTransformError_Success) {
    GST_ELEMENT_ERROR (dsexample, STREAM, FAILED,
        ("NvBufSurfTransformSetSessionParams failed with error %d", err),
        (NULL));
    return FALSE;
  }

  nvtxEventAttributes_t eventAttrib = {0};
  eventAttrib.version = NVTX_VERSION;
  eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
  eventAttrib.colorType = NVTX_COLOR_ARGB;
  eventAttrib.color = 0xFFFF0000;
  eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
  nvtx_str = "convert_buf batch_num=" + std::to_string(dsexample->current_batch_num);
  eventAttrib.message.ascii = nvtx_str.c_str();

  nvtxDomainRangePushEx(dsexample->nvtx_domain, &eventAttrib);

  g_mutex_lock (&dsexample->process_lock);

  /* Wait if buf queue is empty. */
  while (g_queue_is_empty (dsexample->buf_queue)) {
    g_cond_wait (&dsexample->buf_cond, &dsexample->process_lock);
  }

#ifdef WITH_OPENCV
  /* Pop a buffer from the element's buf queue. */
  batch->cvmat = (cv::Mat *) g_queue_pop_head (dsexample->buf_queue);
#else
  /* Pop a buffer from the element's buf queue. */
  batch->inter_buf = (NvBufSurface *) g_queue_pop_head (dsexample->buf_queue);
  dsexample->inter_buf = batch->inter_buf;
#endif

  g_mutex_unlock (&dsexample->process_lock);

  //Memset the memory
  for (uint i = 0; i < dsexample->batch_insurf.numFilled; i++)
    NvBufSurfaceMemSet (dsexample->inter_buf, i, 0, 0);

  /* Batched tranformation. */
  err = NvBufSurfTransform (&dsexample->batch_insurf, dsexample->inter_buf,
      &dsexample->transform_params);

  nvtxDomainRangePop (dsexample->nvtx_domain);

  if (err != NvBufSurfTransformError_Success) {
    GST_ELEMENT_ERROR (dsexample, STREAM, FAILED,
        ("NvBufSurfTransform failed with error %d while converting buffer",
            err), (NULL));
    return FALSE;
  }

  // Use openCV to remove padding and convert RGBA to BGR. Can be skipped if
  // algorithm can handle padded RGBA data.
  for (guint i = 0; i < dsexample->batch_insurf.numFilled; i++) {
    // Map the buffer so that it can be accessed by CPU
    if (NvBufSurfaceMap (dsexample->inter_buf, i, 0, NVBUF_MAP_READ) != 0) {
      GST_ELEMENT_ERROR (dsexample, STREAM, FAILED,
          ("%s:buffer map to be accessed by CPU failed", __func__), (NULL));
      return FALSE;
    }
    // sync mapped data for CPU access
    NvBufSurfaceSyncForCpu (dsexample->inter_buf, i,0);

#ifdef WITH_OPENCV
    in_mat =
        cv::Mat (dsexample->processing_height, dsexample->processing_width,
        CV_8UC4,dsexample->inter_buf->surfaceList[i].mappedAddr.addr[0],
      dsexample->inter_buf->surfaceList[i].pitch);

#if (CV_MAJOR_VERSION >= 4)
  cv::cvtColor (in_mat, batch->cvmat[i], cv::COLOR_RGBA2BGR);
#else
  cv::cvtColor (in_mat, batch->cvmat[i], CV_RGBA2BGR);
#endif

#ifdef DSEXAMPLE_DEBUG
  static guint cnt = 0;
  cv::imwrite("out_" + std::to_string (cnt) + ".jpeg", batch->cvmat[i]);
  cnt++;
#endif
#endif

    if (NvBufSurfaceUnMap (dsexample->inter_buf, i,0)) {
      GST_ELEMENT_ERROR (dsexample, STREAM, FAILED,
        ("%s:buffer unmap to be accessed by CPU failed", __func__), (NULL));
      return FALSE;
    }

#ifdef __aarch64__
  // To use the converted buffer in CUDA, create an EGLImage and then use
  // CUDA-EGL interop APIs
  if (USE_EGLIMAGE) {
    if (NvBufSurfaceMapEglImage (dsexample->inter_buf, 0) != 0) {
      GST_ELEMENT_ERROR (dsexample, STREAM, FAILED,
        ("%s:buffer map eglimage failed", __func__), (NULL));
      return FALSE;
    }
    // dsexample->inter_buf->surfaceList[0].mappedAddr.eglImage
    // Use interop APIs cuGraphicsEGLRegisterImage and
    // cuGraphicsResourceGetMappedEglFrame to access the buffer in CUDA

    // Destroy the EGLImage
    NvBufSurfaceUnMapEglImage (dsexample->inter_buf, 0);
  }
#endif
  }

  /* Push the batch info structure in the processing queue and notify the process
   * thread that a new batch has been queued. */
  g_mutex_lock (&dsexample->process_lock);

  g_queue_push_tail (dsexample->process_queue, batch);
  g_cond_broadcast (&dsexample->process_cond);

  g_mutex_unlock (&dsexample->process_lock);

  return TRUE;
}

/**
 * Called when element recieves an input buffer from upstream element.
 */
static GstFlowReturn
gst_dsexample_submit_input_buffer (GstBaseTransform * btrans,
    gboolean discont, GstBuffer * inbuf)
{
  GstDsExample *dsexample = GST_DSEXAMPLE (btrans);
  GstMapInfo in_map_info;
  NvBufSurface *in_surf;
  GstDsExampleBatch *buf_push_batch;
  GstFlowReturn flow_ret;
  std::string nvtx_str;
  std::unique_ptr < GstDsExampleBatch > batch = nullptr;

  NvDsBatchMeta *batch_meta = NULL;
  guint i = 0;
  gdouble scale_ratio = 1.0;
  guint num_filled = 0;

  dsexample->current_batch_num++;

  nvtxEventAttributes_t eventAttrib = {0};
  eventAttrib.version = NVTX_VERSION;
  eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
  eventAttrib.colorType = NVTX_COLOR_ARGB;
  eventAttrib.color = 0xFFFF0000;
  eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
  nvtx_str = "buffer_process batch_num=" + std::to_string(dsexample->current_batch_num);
  eventAttrib.message.ascii = nvtx_str.c_str();
  nvtxRangeId_t buf_process_range = nvtxDomainRangeStartEx(dsexample->nvtx_domain, &eventAttrib);

  memset (&in_map_info, 0, sizeof (in_map_info));

  /* Map the buffer contents and get the pointer to NvBufSurface. */
  if (!gst_buffer_map (inbuf, &in_map_info, GST_MAP_READ)) {
    GST_ELEMENT_ERROR (dsexample, STREAM, FAILED,
        ("%s:gst buffer map to get pointer to NvBufSurface failed", __func__), (NULL));
    return GST_FLOW_ERROR;
  }
  in_surf = (NvBufSurface *) in_map_info.data;

  nvds_set_input_system_timestamp (inbuf, GST_ELEMENT_NAME (dsexample));

  batch_meta = gst_buffer_get_nvds_batch_meta (inbuf);
  if (batch_meta == nullptr) {
    GST_ELEMENT_ERROR (dsexample, STREAM, FAILED,
        ("NvDsBatchMeta not found for input buffer."), (NULL));
    return GST_FLOW_ERROR;
  }
  num_filled = batch_meta->num_frames_in_batch;

  if (dsexample->process_full_frame) {
    for (guint i = 0; i < num_filled; i++) {
      NvOSD_RectParams rect_params;

      // Scale the entire frame to processing resolution
      rect_params.left = 0;
      rect_params.top = 0;
      rect_params.width = in_surf->surfaceList[i].width;
      rect_params.height = in_surf->surfaceList[i].height;

      // Scale the frame maintaining aspect ratio
      if (scale_and_fill_data (dsexample, in_surf->surfaceList + i,
              &rect_params, scale_ratio, dsexample->video_info.width,
              dsexample->video_info.height) != GST_FLOW_OK) {
        goto error;
      }

      if (batch == nullptr) {
        batch.reset (new GstDsExampleBatch);
        batch->push_buffer = FALSE;
        batch->inbuf = inbuf;
        batch->inbuf_batch_num = dsexample->current_batch_num;
      }

      /* Adding a frame to the current batch. Set the frames members. */
      GstDsExampleFrame frame;
      frame.scale_ratio_x = scale_ratio;
      frame.scale_ratio_y = scale_ratio;
      frame.obj_meta = nullptr;
      frame.frame_meta =
          nvds_get_nth_frame_meta (batch_meta->frame_meta_list, i);
      frame.frame_num = frame.frame_meta->frame_num;
      frame.batch_index = i;
      frame.input_surf_params = in_surf->surfaceList + i;
      batch->frames.push_back (frame);

      // Set the transform session parameters for the conversions executed in this
      // thread.
      if (batch->frames.size () == dsexample->max_batch_size || i == num_filled) {
        if (!convert_batch_and_push_to_process_thread (dsexample, batch.get ())) {
          return GST_FLOW_ERROR;
        }
        /* Batch submitted. Set batch to nullptr so that a new GstDsExampleBatch
         * structure can be allocated if required. */
        batch.release ();
        dsexample->batch_insurf.numFilled = 0;
      }
    }
  } else {
    // Using object crops as input to the algorithm. The objects are detected by
    // the primary detector
    NvDsFrameMeta *frame_meta = NULL;
    NvDsMetaList *l_frame = NULL;
    NvDsObjectMeta *obj_meta = NULL;
    NvDsMetaList *l_obj = NULL;

    for (l_frame = batch_meta->frame_meta_list; l_frame != NULL;
        l_frame = l_frame->next) {
      frame_meta = (NvDsFrameMeta *) (l_frame->data);
      for (l_obj = frame_meta->obj_meta_list; l_obj != NULL;
          l_obj = l_obj->next) {
        obj_meta = (NvDsObjectMeta *) (l_obj->data);

        /* Should not process on objects smaller than MIN_INPUT_OBJECT_WIDTH x MIN_INPUT_OBJECT_HEIGHT
         * since it will cause hardware scaling issues. */
        if (obj_meta->rect_params.width < MIN_INPUT_OBJECT_WIDTH ||
            obj_meta->rect_params.height < MIN_INPUT_OBJECT_HEIGHT)
          continue;

        // Crop and scale the object maintainig aspect ratio
        if (scale_and_fill_data (dsexample,
                in_surf->surfaceList + frame_meta->batch_id,
                &obj_meta->rect_params, scale_ratio,
                dsexample->video_info.width,
                dsexample->video_info.height) != GST_FLOW_OK) {
          // Error in conversion, skip processing on object. */
          continue;
        }

        if (batch == nullptr) {
          batch.reset (new GstDsExampleBatch);
          batch->push_buffer = FALSE;
          batch->inbuf = inbuf;
          batch->inbuf_batch_num = dsexample->current_batch_num;
          batch->nvtx_complete_buf_range = buf_process_range;
        }

        /* Adding a frame to the current batch. Set the frames members. */
        GstDsExampleFrame frame;
        frame.scale_ratio_x = scale_ratio;
        frame.scale_ratio_y = scale_ratio;
        frame.obj_meta = obj_meta;
        frame.frame_meta =
            nvds_get_nth_frame_meta (batch_meta->frame_meta_list, i);
        frame.frame_num = frame.frame_meta->frame_num;
        frame.batch_index = i;
        frame.input_surf_params = in_surf->surfaceList + i;
        batch->frames.push_back (frame);

        i++;

        // Convert batch and push to process thread
        if (batch->frames.size () == dsexample->max_batch_size
            || i == num_filled) {
          if (!convert_batch_and_push_to_process_thread (dsexample,
                  batch.get ())) {
            return GST_FLOW_ERROR;
          }
          /* Batch submitted. Set batch to nullptr so that a new GstDsExampleBatch
           * structure can be allocated if required. */
          i = 0;
          batch.release ();
          dsexample->batch_insurf.numFilled = 0;
        }
      }
    }
  }
    /* Submit a non-full batch. */
  if (batch) {
    if (!convert_batch_and_push_to_process_thread (dsexample, batch.get ())) {
      return GST_FLOW_ERROR;
    }
    batch.release ();
    dsexample->batch_insurf.numFilled = 0;
  }

  nvtxDomainRangeEnd(dsexample->nvtx_domain, buf_process_range);

  /* Queue a push buffer batch. This batch is not inferred. This batch is to
   * signal the process thread that there are no more batches
   * belonging to this input buffer and this GstBuffer can be pushed to
   * downstream element once all the previous processing is done. */
  buf_push_batch = new GstDsExampleBatch;
  buf_push_batch->inbuf = inbuf;
  buf_push_batch->push_buffer = TRUE;
  buf_push_batch->nvtx_complete_buf_range = buf_process_range;

  g_mutex_lock (&dsexample->process_lock);
  /* Check if this is a push buffer or event marker batch. If yes, no need to
   * queue the input for inferencing. */
  if (buf_push_batch->push_buffer) {
    /* Push the batch info structure in the processing queue and notify the
     * process thread that a new batch has been queued. */
    g_queue_push_tail (dsexample->process_queue, buf_push_batch);
    g_cond_broadcast (&dsexample->process_cond);
  }
  g_mutex_unlock (&dsexample->process_lock);

  flow_ret = GST_FLOW_OK;

error:
  gst_buffer_unmap (inbuf, &in_map_info);
  return flow_ret;
}

/**
 * If submit_input_buffer is implemented, it is mandatory to implement
 * generate_output. Buffers are not pushed to the downstream element from here.
 * Return the GstFlowReturn value of the latest pad push so that any error might
 * be caught by the application.
 */
static GstFlowReturn
gst_dsexample_generate_output (GstBaseTransform * btrans, GstBuffer ** outbuf)
{
  GstDsExample *dsexample = GST_DSEXAMPLE (btrans);
  return dsexample->last_flow_ret;
}

/**
 * Attach metadata for the full frame. We will be adding a new metadata.
 */
static void
attach_metadata_full_frame (GstDsExample * dsexample,
    NvDsFrameMeta * frame_meta, gdouble scale_ratio, DsExampleOutput * output,
    guint batch_id)
{
  NvDsBatchMeta *batch_meta = frame_meta->base_meta.batch_meta;
  NvDsObjectMeta *object_meta = NULL;
  static gchar font_name[] = "Serif";
  GST_DEBUG_OBJECT (dsexample, "Attaching metadata %d\n", output->numObjects);

  for (gint i = 0; i < output->numObjects; i++) {
    DsExampleObject *obj = &output->object[i];
    object_meta = nvds_acquire_obj_meta_from_pool (batch_meta);
    NvOSD_RectParams & rect_params = object_meta->rect_params;
    NvOSD_TextParams & text_params = object_meta->text_params;

    // Assign bounding box coordinates
    rect_params.left = obj->left;
    rect_params.top = obj->top;
    rect_params.width = obj->width;
    rect_params.height = obj->height;

    // Semi-transparent yellow background
    rect_params.has_bg_color = 0;
    rect_params.bg_color = (NvOSD_ColorParams) {
    1, 1, 0, 0.4};
    // Red border of width 6
    rect_params.border_width = 3;
    rect_params.border_color = (NvOSD_ColorParams) {
    1, 0, 0, 1};

    // Scale the bounding boxes proportionally based on how the object/frame was
    // scaled during input
    rect_params.left /= scale_ratio;
    rect_params.top /= scale_ratio;
    rect_params.width /= scale_ratio;
    rect_params.height /= scale_ratio;
    GST_DEBUG_OBJECT (dsexample, "Attaching rect%d of batch%u"
        "  left->%f top->%f width->%f"
        " height->%f label->%s\n", i, batch_id, rect_params.left,
        rect_params.top, rect_params.width, rect_params.height, obj->label);

    object_meta->object_id = UNTRACKED_OBJECT_ID;
    g_strlcpy (object_meta->obj_label, obj->label, MAX_LABEL_SIZE);
    // display_text required heap allocated memory
    text_params.display_text = g_strdup (obj->label);
    // Display text above the left top corner of the object
    text_params.x_offset = rect_params.left;
    text_params.y_offset = rect_params.top - 10;
    // Set black background for the text
    text_params.set_bg_clr = 1;
    text_params.text_bg_clr = (NvOSD_ColorParams) {
    0, 0, 0, 1};
    // Font face, size and color
    text_params.font_params.font_name = font_name;
    text_params.font_params.font_size = 11;
    text_params.font_params.font_color = (NvOSD_ColorParams) {
    1, 1, 1, 1};

    nvds_add_obj_meta_to_frame (frame_meta, object_meta, NULL);
  }
}

/**
 * Only update string label in an existing object metadata. No bounding boxes.
 * We assume only one label per object is generated
 */
static void
attach_metadata_object (GstDsExample * dsexample, NvDsObjectMeta * obj_meta,
    DsExampleOutput * output)
{
  if (output->numObjects == 0)
    return;
  NvDsBatchMeta *batch_meta = obj_meta->base_meta.batch_meta;

  NvDsClassifierMeta *classifier_meta =
      nvds_acquire_classifier_meta_from_pool (batch_meta);

  classifier_meta->unique_component_id = dsexample->unique_id;

  NvDsLabelInfo *label_info =
      nvds_acquire_label_info_meta_from_pool (batch_meta);
  g_strlcpy (label_info->result_label, output->object[0].label, MAX_LABEL_SIZE);
  nvds_add_label_info_meta_to_classifier (classifier_meta, label_info);
  nvds_add_classifier_meta_to_object (obj_meta, classifier_meta);

  nvds_acquire_meta_lock (batch_meta);
  NvOSD_TextParams & text_params = obj_meta->text_params;
  NvOSD_RectParams & rect_params = obj_meta->rect_params;

  /* Below code to display the result */
  // Set black background for the text
  // display_text required heap allocated memory
  if (text_params.display_text) {
    gchar *conc_string = g_strconcat (text_params.display_text, " ",
        output->object[0].label, NULL);
    g_free (text_params.display_text);
    text_params.display_text = conc_string;
  } else {
    // Display text above the left top corner of the object
    text_params.x_offset = rect_params.left;
    text_params.y_offset = rect_params.top - 10;
    text_params.display_text = g_strdup (output->object[0].label);
    // Font face, size and color
    text_params.font_params.font_name = (char *) "Serif";
    text_params.font_params.font_size = 11;
    text_params.font_params.font_color = (NvOSD_ColorParams) {
    1, 1, 1, 1};
    // Set black background for the text
    text_params.set_bg_clr = 1;
    text_params.text_bg_clr = (NvOSD_ColorParams) {
    0, 0, 0, 1};
  }
  nvds_release_meta_lock (batch_meta);
}

/**
 * Output loop used to pop output from processing thread, attach the output to the
 * buffer in form of NvDsMeta and push the buffer to downstream element.
 */
static gpointer
gst_dsexample_output_loop (gpointer data)
{
  GstDsExample *dsexample = GST_DSEXAMPLE (data);
  DsExampleOutput *output;
  NvDsObjectMeta *obj_meta = NULL;
  gdouble scale_ratio = 1.0;

  nvtxEventAttributes_t eventAttrib = {0};
  eventAttrib.version = NVTX_VERSION;
  eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
  eventAttrib.colorType = NVTX_COLOR_ARGB;
  eventAttrib.color = 0xFFFF0000;
  eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
  std::string nvtx_str;

  nvtx_str =
      "gst-dsexample_output-loop_uid=" + std::to_string (dsexample->unique_id);

  g_mutex_lock (&dsexample->process_lock);

  /* Run till signalled to stop. */
  while (!dsexample->stop) {
    std::unique_ptr < GstDsExampleBatch > batch = nullptr;

    /* Wait if processing queue is empty. */
    if (g_queue_is_empty (dsexample->process_queue)) {
      g_cond_wait (&dsexample->process_cond, &dsexample->process_lock);
      continue;
    }

    /* Pop a batch from the element's process queue. */
    batch.reset ((GstDsExampleBatch *)
        g_queue_pop_head (dsexample->process_queue));
    g_cond_broadcast (&dsexample->process_cond);

    /* Event marker used for synchronization. No need to process further. */
    if (batch->event_marker) {
      continue;
    }

    g_mutex_unlock (&dsexample->process_lock);

    /* Need to only push buffer to downstream element. This batch was not
     * actually submitted for inferencing. */
    if (batch->push_buffer) {
      nvtxDomainRangeEnd(dsexample->nvtx_domain, batch->nvtx_complete_buf_range);

      nvds_set_output_system_timestamp (batch->inbuf,
          GST_ELEMENT_NAME (dsexample));

      GstFlowReturn flow_ret =
          gst_pad_push (GST_BASE_TRANSFORM_SRC_PAD (dsexample),
          batch->inbuf);
      if (dsexample->last_flow_ret != flow_ret) {
        switch (flow_ret) {
            /* Signal the application for pad push errors by posting a error message
             * on the pipeline bus. */
          case GST_FLOW_ERROR:
          case GST_FLOW_NOT_LINKED:
          case GST_FLOW_NOT_NEGOTIATED:
            GST_ELEMENT_ERROR (dsexample, STREAM, FAILED,
                ("Internal data stream error."),
                ("streaming stopped, reason %s (%d)",
                    gst_flow_get_name (flow_ret), flow_ret));
            break;
          default:
            break;
        }
      }
      dsexample->last_flow_ret = flow_ret;
      g_mutex_lock (&dsexample->process_lock);
      continue;
    }

    nvtx_str = "dequeueOutputAndAttachMeta batch_num=" + std::to_string(batch->inbuf_batch_num);
    eventAttrib.message.ascii = nvtx_str.c_str();
    nvtxDomainRangePushEx(dsexample->nvtx_domain, &eventAttrib);

    /* For each frame attach metadata output. */
    for (guint i = 0; i < batch->frames.size (); i++) {
      if (dsexample->process_full_frame) {
        // Process to get the output
#ifdef WITH_OPENCV
        output =
            DsExampleProcess (dsexample->dsexamplelib_ctx,
            batch->cvmat[i].data);
#else
        output =
            DsExampleProcess (dsexample->dsexamplelib_ctx,
            (unsigned char *)batch->inter_buf->surfaceList[i].mappedAddr.addr[0]);
#endif
        // Attach the metadata for the full frame
        attach_metadata_full_frame (dsexample, batch->frames[i].frame_meta,
            scale_ratio, output, i);
        free (output);
      } else {
        GstDsExampleFrame & frame = batch->frames[i];

        obj_meta = frame.obj_meta;

        /* Should not process on objects smaller than MIN_INPUT_OBJECT_WIDTH x MIN_INPUT_OBJECT_HEIGHT
         * since it will cause hardware scaling issues. */
        if (obj_meta->rect_params.width < MIN_INPUT_OBJECT_WIDTH ||
            obj_meta->rect_params.height < MIN_INPUT_OBJECT_HEIGHT)
          continue;

        // Process the object crop to obtain label
#ifdef WITH_OPENCV
        output = DsExampleProcess (dsexample->dsexamplelib_ctx,
            batch->cvmat[i].data);
#else
        output = DsExampleProcess (dsexample->dsexamplelib_ctx,
            (unsigned char *)batch->inter_buf->surfaceList[i].mappedAddr.addr[0]);
#endif

        // Attach labels for the object
        attach_metadata_object (dsexample, obj_meta, output);

        free (output);
      }
    }

    g_mutex_lock (&dsexample->process_lock);

#ifdef WITH_OPENCV
    g_queue_push_tail (dsexample->buf_queue, batch->cvmat);
#else
    g_queue_push_tail (dsexample->buf_queue, batch->inter_buf);
#endif
    g_cond_broadcast (&dsexample->buf_cond);

    nvtxDomainRangePop (dsexample->nvtx_domain);
  }
  g_mutex_unlock (&dsexample->process_lock);

  return nullptr;
}

/**
 * Boiler plate for registering a plugin and an element.
 */
static gboolean
dsexample_plugin_init (GstPlugin * plugin)
{
  GST_DEBUG_CATEGORY_INIT (gst_dsexample_debug, "dsexample", 0,
      "dsexample plugin");

  return gst_element_register (plugin, "dsexample", GST_RANK_PRIMARY,
      GST_TYPE_DSEXAMPLE);
}

GST_PLUGIN_DEFINE (GST_VERSION_MAJOR,
    GST_VERSION_MINOR,
    nvdsgst_dsexample,
    DESCRIPTION, dsexample_plugin_init, "6.0", LICENSE, BINARY_PACKAGE,
    URL)