// yolo.cpp

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <VX/vx.h>
#include <vx_ext_amd.h>
#include <net_api.h>
#include <math.h>
#include <memory.h>
#include <algorithm>
// standard headers for the std::string / std::vector / std::map usage below
#include <string>
#include <vector>
#include <map>
#include <vx_log.h>
#include <unistd.h>
#include "yolo.h"
#include <ax_type.h>
#include "ansjer_ai_cfg.h"

#ifndef CLIPRETINA
#define CLIPRETINA(v, mn, mx) \
{ \
    if ((v) < (mn)) { (v) = (mn); } \
    else if ((v) > (mx)) { (v) = (mx); } \
}
#endif

#define JSONPATH "/platform/ax/model/"
typedef struct {
    vx_size dst_handle;
    vx_context context;
    vx_context handle_context;
    vx_graph graph;
    // network input width
    int net_w;
    // network input height
    int net_h;
    // source image width
    int src_w;
    // source image height
    int src_h;
    // scale factor from source image to network input
    float scale_info;
    unsigned int strides[3];
    int nBlobSz;
    ax_nna_tensors_t pOutTensor;
    std::string model_type;
    unsigned int anchors_g[18];
    std::vector<std::string> yolo_outputs_name_g;
    std::vector<std::string> label;
    bool need_nu_freq;
    int freq_nu;
    bool need_vu_freq;
    int freq_vu;
    float conf_thresh;
    float iou_thresh;
} nna_yolo_private_t;
struct Box {
    float xyxy[4] = { 0, 0, 0, 0 };
    float object_score = 0;
    size_t index = 0;
    float score = 0;
    float area = 0;
};
static float sigmoid(float x)
{
    return 1.0 / (1.0 + exp(-x));
}
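// Decode a Distribution Focal Loss (DFL) head: for each of the 4 box sides,
// apply a softmax over reg_max bins and take the expected bin index.
// "dfl" holds 4 * reg_max raw logits per location; "dst" receives the 4
// expected distances (left, top, right, bottom) in grid-cell units.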
static void dfl_process(const std::vector<float> &dfl, std::vector<float> &dst, int reg_max = 16)
{
    std::vector<float> temp;
    float max_value, denominator, sum_value;
    for (int i = 0; i < int(dfl.size() / reg_max); ++i) {
        // softmax over the reg_max logits of this group (max-subtracted for numerical stability)
        max_value = *std::max_element(dfl.begin() + int(i * reg_max), dfl.begin() + int((i + 1) * reg_max));
        denominator = 0;
        sum_value = 0;
        for (int j = int(i * reg_max); j < int((i + 1) * reg_max); ++j) {
            // temp accumulates across groups, so the absolute index j stays valid
            temp.push_back(std::exp(dfl[j] - max_value));
            denominator += temp[j];
        }
        for (int j = int(i * reg_max); j < int((i + 1) * reg_max); ++j) {
            temp[j] /= denominator;
            sum_value += temp[j] * (j - int(i * reg_max)); // expected value of the distribution
        }
        dst.push_back(sum_value);
    }
}
// Replace characters that are unsafe in file names with '_'.
void RefineFileName(int len, char *fname)
{
    for (int n = 0; n < len; n++) {
        if (fname[n] == '/' ||
            fname[n] == '(' ||
            fname[n] == ')' ||
            fname[n] == ',' ||
            fname[n] == ' ' ||
            fname[n] == '[' ||
            fname[n] == ']' ||
            fname[n] == ':') {
            fname[n] = '_';
        }
    }
}
int DumpBlobsInfo(std::vector<std::string> &blobs_name, std::vector<vx_tensor> &blobs_tensor)
{
    for (int i = 0; i < blobs_name.size(); i++) {
        vx_size dims[4];
        ERROR_CHECK_STATUS(vxQueryTensor(blobs_tensor.at(i), VX_TENSOR_DIMS, dims, sizeof(dims)));
        vx_enum data_type;
        ERROR_CHECK_STATUS(vxQueryTensor(blobs_tensor.at(i), VX_TENSOR_DATA_TYPE, &data_type, sizeof(data_type)));
        vx_size fixed_pos;
        ERROR_CHECK_STATUS(vxQueryTensor(blobs_tensor.at(i), VX_TENSOR_FIXED_POINT_POSITION, &fixed_pos, sizeof(fixed_pos)));
        vx_size data_size;
        ERROR_CHECK_STATUS(vxQueryTensor(blobs_tensor.at(i), VX_TENSOR_MEMORY_SIZE, &data_size, sizeof(data_size)));
        vx_size data_handle;
        ERROR_CHECK_STATUS(vxQueryTensor(blobs_tensor.at(i), VX_TENSOR_MEMORY_HANDLE, &data_handle, sizeof(data_handle)));
        vx_size data_virt_addr;
        ERROR_CHECK_STATUS(vxQueryTensor(blobs_tensor.at(i), VX_TENSOR_MEMORY_VIRT_ADDR, &data_virt_addr, sizeof(data_virt_addr)));
        vx_size data_phys_addr;
        ERROR_CHECK_STATUS(vxQueryTensor(blobs_tensor.at(i), VX_TENSOR_MEMORY_PHYS_ADDR, &data_phys_addr, sizeof(data_phys_addr)));
        VX_LOG("name: %s\n", blobs_name.at(i).c_str());
        VX_LOG("\t(w, h, c, b): (%d, %d, %d, %d)\n", dims[0], dims[1], dims[2], dims[3]);
        VX_LOG("\tdata type: %d\n", data_type);
        VX_LOG("\tfixed position: %d\n", fixed_pos);
        VX_LOG("\tdata size: %d\n", data_size);
        VX_LOG("\tdata handle: 0x%x\n", data_handle);
        VX_LOG("\tdata virt addr: 0x%x\n", data_virt_addr);
        VX_LOG("\tdata phys addr: 0x%x\n", data_phys_addr);
    }
    return 0;
}
int DumpBlobsData(std::vector<std::string> &blobs_name, std::vector<vx_tensor> &blobs_tensor)
{
    for (int i = 0; i < blobs_name.size(); i++) {
        vx_size dims[4];
        ERROR_CHECK_STATUS(vxQueryTensor(blobs_tensor.at(i), VX_TENSOR_DIMS, dims, sizeof(dims)));
        vx_size data_size;
        ERROR_CHECK_STATUS(vxQueryTensor(blobs_tensor.at(i), VX_TENSOR_MEMORY_SIZE, &data_size, sizeof(data_size)));
        vx_int8 *data;
        ERROR_CHECK_STATUS(vxQueryTensor(blobs_tensor.at(i), VX_TENSOR_MEMORY_VIRT_ADDR, &data, sizeof(data)));
        VX_LOG("name: %s, dims: (%d, %d, %d, %d), data size: %d\n", blobs_name.at(i).c_str(), dims[0], dims[1], dims[2], dims[3], data_size);
        // sanitize only the blob name, not the directory part of the path
        char fname[256];
        snprintf(fname, sizeof(fname), "%s", blobs_name.at(i).c_str());
        RefineFileName(strlen(fname), fname);
        char out_file[512];
        snprintf(out_file, sizeof(out_file), "/share/res_hw/%s.bin", fname);
        FILE *fout = fopen(out_file, "wb");
        if (fout == NULL) {
            VX_LOG("open %s for write failed\n", out_file);
            continue;
        }
        fwrite(data, 1, data_size, fout);
        fclose(fout);
    }
    return 0;
}
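// Decode one YOLOv5 output layer into box proposals.
// The indexing below assumes the NPU lays the tensor out in channel tiles of 16
// with the H dimension padded to H_align. Per anchor/cell the channels are
// (x, y, w, h, objectness, class scores...); the YOLOv5 decode is applied:
//   x = (2*sigmoid(tx) - 0.5 + grid_x) * stride
//   w = (2*sigmoid(tw))^2 * anchor_w
// Raw int8 values are dequantized by the fixed-point shift "fl" before sigmoid.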
static int yolov5_layer(std::vector<Box>& proposals,
                        const signed char* output_ptr,
                        const size_t output_size[4],
                        const unsigned int& image_size,
                        const unsigned int& stride,
                        const unsigned int& fl,
                        const unsigned int& data_type,
                        const unsigned int* anchor,
                        const unsigned int& class_num,
                        const float& confidence_threshold)
{
    int H_align = 0;
    if (data_type == 2) H_align = (output_size[1] + 3) / 4 * 4;
    else H_align = (output_size[1] + 1) / 2 * 2;
    for (size_t a = 0; a < 3; ++a) { // anchor groups = 3
        for (size_t w = 0; w < output_size[0]; ++w) {
            for (size_t h = 0; h < output_size[1]; ++h) {
                Box box;
                size_t max_index = 0;
                float max_score = -1;
                for (size_t c = 0; c < 4 + 1 + class_num; ++c) {
                    size_t ci = a * (4 + 1 + class_num) + c;
                    size_t index = ci / 16 * output_size[0] * H_align * 16 + w * H_align * 16 + h * 16 + (ci % 16);
                    // dequantize and apply sigmoid
                    float data = sigmoid(output_ptr[index] * 1.0 / pow(2, fl));
                    if (c == 0) {
                        // center x
                        data = (data * 2 - 0.5f + w) * static_cast<float>(stride);
                        box.xyxy[c] = data;
                    } else if (c == 1) {
                        // center y
                        data = (data * 2 - 0.5f + h) * static_cast<float>(stride);
                        box.xyxy[c] = data;
                    } else if (c == 2 || c == 3) {
                        // width / height
                        data = powf((data * 2), 2) * anchor[a * 2 + c - 2];
                        box.xyxy[c] = data;
                    } else if (c == 4) {
                        box.object_score = data;
                    } else {
                        // keep the best class score
                        if (data > max_score) {
                            max_index = c - 5;
                            max_score = data;
                        }
                    }
                }
                box.score = max_score * box.object_score;
                box.index = max_index;
                if (box.object_score > confidence_threshold && box.score > confidence_threshold) {
                    // xywh -> xyxy
                    float x = box.xyxy[0], y = box.xyxy[1], w = box.xyxy[2], h = box.xyxy[3];
                    box.xyxy[0] = x - w / 2;
                    box.xyxy[1] = y - h / 2;
                    box.xyxy[2] = x + w / 2;
                    box.xyxy[3] = y + h / 2;
                    box.area = (box.xyxy[2] - box.xyxy[0] + 1) * (box.xyxy[3] - box.xyxy[1] + 1);
                    proposals.push_back(box);
                    // VX_LOG("%s box %f %f %f %f %f %d\n", __FUNCTION__, box.xyxy[0], box.xyxy[1], box.xyxy[2], box.xyxy[3], box.score, box.index);
                }
            }
        }
    }
    return VX_SUCCESS;
}
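// Decode one YOLOv8 output layer (anchor-free). Per cell the channels are
// 4 * reg_max DFL logits followed by the class scores; dfl_process() turns the
// logits into left/top/right/bottom distances from the cell center, which are
// then scaled by the stride to obtain the box corners in network-input pixels.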
static int yolov8_layer(std::vector<Box>& proposals,
                        const signed char* output_ptr,
                        const size_t output_size[4],
                        const unsigned int& image_size,
                        const unsigned int& stride,
                        const unsigned int& fl,
                        const unsigned int& data_type,
                        const unsigned int& class_num,
                        const float& confidence_threshold)
{
    int reg_max = 16;
    float grid_cell_offset = 0.5f;
    int H_align = 0;
    if (data_type == 2) H_align = (output_size[1] + 3) / 4 * 4;
    else H_align = (output_size[1] + 1) / 2 * 2;
    for (size_t w = 0; w < output_size[0]; ++w) {
        for (size_t h = 0; h < output_size[1]; ++h) {
            Box box;
            size_t max_index = 0;
            float max_score = -1;
            std::vector<float> dfl, ltrb;
            for (size_t c = 0; c < 4 * reg_max + class_num; ++c) {
                size_t index = c / 16 * output_size[0] * H_align * 16 + w * H_align * 16 + h * 16 + c % 16;
                // dequantize by the fixed-point shift
                float data = output_ptr[index] * 1.0 / pow(2, fl);
                if (c < 4 * reg_max) {
                    dfl.push_back(data);
                    if (c == (4 * reg_max - 1)) {
                        // all DFL logits collected: decode the box corners
                        dfl_process(dfl, ltrb, reg_max);
                        box.xyxy[0] = (w + grid_cell_offset - ltrb[0]) * stride;
                        box.xyxy[1] = (h + grid_cell_offset - ltrb[1]) * stride;
                        box.xyxy[2] = (w + grid_cell_offset + ltrb[2]) * stride;
                        box.xyxy[3] = (h + grid_cell_offset + ltrb[3]) * stride;
                    }
                } else {
                    // keep the best class score
                    if (data > max_score) {
                        max_index = c - 4 * reg_max;
                        max_score = data;
                    }
                }
            }
            box.score = max_score;
            box.index = max_index;
            if (box.score > confidence_threshold) {
                box.area = (box.xyxy[2] - box.xyxy[0] + 1) * (box.xyxy[3] - box.xyxy[1] + 1);
                proposals.push_back(box);
                // VX_LOG("%s box %f %f %f %f %f %d\n", __FUNCTION__, box.xyxy[0], box.xyxy[1], box.xyxy[2], box.xyxy[3], box.score, box.index);
            }
        }
    }
    return VX_SUCCESS;
}
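// Greedy IoU-based non-maximum suppression: boxes are sorted by descending score
// and any box overlapping a higher-scoring box by more than nms_threshold is erased.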
static bool yolov_box_cmp(const Box& a, const Box& b)
{
    return a.score > b.score;
}

static int yolo_nms(std::vector<Box>& boxes, const float& nms_threshold)
{
    std::sort(boxes.begin(), boxes.end(), yolov_box_cmp);
    size_t current_index = 0;
    while (current_index < boxes.size()) {
        Box current_box = boxes[current_index];
        size_t running_index = current_index + 1;
        while (running_index < boxes.size()) {
            Box running_box = boxes[running_index];
            // intersection rectangle
            float xx1 = std::max(current_box.xyxy[0], running_box.xyxy[0]);
            float yy1 = std::max(current_box.xyxy[1], running_box.xyxy[1]);
            float xx2 = std::min(current_box.xyxy[2], running_box.xyxy[2]);
            float yy2 = std::min(current_box.xyxy[3], running_box.xyxy[3]);
            float w = std::max(0.0f, xx2 - xx1 + 1.0f);
            float h = std::max(0.0f, yy2 - yy1 + 1.0f);
            float inter_area = w * h;
            float union_area = current_box.area + running_box.area - inter_area;
            float overlap = inter_area / union_area;
            if (overlap > nms_threshold) {
                boxes.erase(boxes.begin() + running_index);
            } else {
                ++running_index;
            }
        }
        ++current_index;
    }
    return VX_SUCCESS;
}
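// Create a detector instance: read the model JSON config, load the compiled
// network and weights, verify the OpenVX graph, allocate the BGRA network-input
// buffer and cache the shape / stride / fixed-point info of every output tensor.
// Returns an opaque handle (nna_yolo_private_t*) or NULL on failure.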
void *nna_custom_det_open(ezax_custom_det_cfg_t *cfg)
{
    // char *filename = (char *)"/data/yolo_config.json";
    // char *filename = (char *)"/platform/ax/model/yolo_config.json";
    // char *filename = (char *)"/share/yolo_config.json";
    // VX_LOG("%s Line:%d %s\n", __FUNCTION__, __LINE__, cfg->model_rootpath);
    char *filename = (char*)malloc(strlen(JSONPATH) + strlen(cfg->model_rootpath) + 1);
    strcpy(filename, JSONPATH);
    strcat(filename, cfg->model_rootpath);
    VX_LOG("%s Line:%d read json from %s\n", __FUNCTION__, __LINE__, filename);
    // use new (not malloc/memset) so any std::string/std::vector members are constructed
    Asj_Ai_Cfg_t *ai_config = new Asj_Ai_Cfg_t();
    try {
        read_Asj_Ai_Json(ai_config, filename);
    } catch (const std::exception &e) {
        VX_LOG("%s Line:%d read json error..\n", __FUNCTION__, __LINE__);
        free(filename);
        delete ai_config;
        return NULL;
    }
    free(filename);
    vx_int8* net;
    vx_int8* blobs;
    nna_yolo_private_t *yolo;
    std::vector<std::string> input_blobs_name;
    std::vector<vx_tensor> input_blobs_tensor;
    std::vector<std::string> output_blobs_name;
    std::vector<vx_tensor> output_blobs_tensor;
    vx_size input_data_dims[4] = { 1 };
    vx_size output_data_dims[4] = { 1 };
    // the private struct holds C++ members, so it is created with new as well
    yolo = new nna_yolo_private_t();
    if (yolo == NULL) {
        VX_LOG("%s Line:%d alloc yolo error\n", __FUNCTION__, __LINE__);
        delete ai_config;
        return NULL;
    }
    yolo->model_type = ai_config->property.model_type;
    if (yolo->model_type == "yolov5") {
        for (int i = 0; i < 18; i++) {
            yolo->anchors_g[i] = ai_config->property.anchors[i];
        }
    }
    yolo->yolo_outputs_name_g = ai_config->property.yolo_outputs_name;
    yolo->need_nu_freq = ai_config->property.need_nu_freq;
    yolo->freq_nu = ai_config->property.freq_nu;
    yolo->need_vu_freq = ai_config->property.need_vu_freq;
    yolo->freq_vu = ai_config->property.freq_vu;
    yolo->conf_thresh = ai_config->class_attrs_all.conf_thresh;
    yolo->iou_thresh = ai_config->class_attrs_all.iou_thresh;
    yolo->label = ai_config->property.label;
    yolo->context = vxCreateContext();
    ERROR_CHECK_OBJECT(yolo->context);
    ERROR_CHECK_STATUS(vxLoadKernels(yolo->context, "openvx-nn"));
    ERROR_CHECK_STATUS(LoadNetModel(yolo->context, ai_config->property.ezbStr, true, &net));
    ERROR_CHECK_STATUS(LoadNetModel(yolo->context, ai_config->property.binStr, true, &blobs));
    yolo->graph = CreateNetGraph(yolo->context, (vx_uint32*)net, blobs, true);
    ERROR_CHECK_STATUS(vxVerifyGraph(yolo->graph));
    vx_int32 graph_mem = vxGetGraphInOutMemorySize(yolo->graph);
    VX_LOG("%s Line:%d graph_mem:%d.\n", __FUNCTION__, __LINE__, graph_mem);
    // ERROR_CHECK_STATUS(vxProfileGraph(yolo->graph));
    // VX_LOG("%s Line:%d vxProfileGraph success.\n", __FUNCTION__, __LINE__, graph_mem);
    UnLoadNetModel(blobs);
    UnLoadNetModel(net);
    delete ai_config;
    GetNetInputBlob(yolo->graph, input_blobs_name, input_blobs_tensor);
    ERROR_CHECK_STATUS(vxQueryTensor(input_blobs_tensor.at(0), VX_TENSOR_DIMS, input_data_dims, sizeof(input_data_dims)));
    yolo->net_w = input_data_dims[0];
    yolo->net_h = input_data_dims[1];
    yolo->src_w = cfg->width;
    yolo->src_h = cfg->height;
    // letterbox scale: the factor that fits the source image into the network input
    yolo->scale_info = std::min(yolo->net_w * 1.0 / yolo->src_w, yolo->net_h * 1.0 / yolo->src_h);
    yolo->handle_context = vxCreateContext();
    ERROR_CHECK_OBJECT(yolo->handle_context);
    VX_LOG("%s Line:%d before AllocDeviceImageBuffer.\n", __FUNCTION__, __LINE__);
    yolo->dst_handle = AllocDeviceImageBuffer(yolo->handle_context, NNA_BGRA, yolo->net_w, yolo->net_h);
    if (yolo->dst_handle == 0) {
        VX_LOG("%s Line:%d alloc rgba image buffer error.\n", __FUNCTION__, __LINE__);
        return NULL;
    }
    VX_LOG("%s Line:%d AllocDeviceImageBuffer success.\n", __FUNCTION__, __LINE__);
    VX_LOG("%s Line:%d yolo->model_type:%s yolo->net_w:%d yolo->net_h:%d yolo->src_w:%d yolo->src_h:%d yolo->scale_info:%f\n", __FUNCTION__, __LINE__, yolo->model_type.c_str(), yolo->net_w, yolo->net_h, yolo->src_w, yolo->src_h,
           yolo->scale_info);
    input_blobs_name.clear();
    input_blobs_tensor.clear();
    GetNetOutputBlob(yolo->graph, output_blobs_name, output_blobs_tensor);
    ERROR_CHECK_STATUS(vxQueryTensor(output_blobs_tensor.at(0), VX_TENSOR_DIMS, output_data_dims, sizeof(output_data_dims)));
    yolo->nBlobSz = output_blobs_tensor.size();
    yolo->pOutTensor.nTensor = (ax_nna_tensor_t*)AX_MALLOC(sizeof(ax_nna_tensor_t) * output_blobs_tensor.size());
    yolo->pOutTensor.nTensorNum = output_blobs_tensor.size();
    for (vx_size i = 0; i < output_blobs_tensor.size(); ++i) {
        vx_size output_layer_data_dims[4];
        vx_tensor layer_output = output_blobs_tensor[i];
        ERROR_CHECK_STATUS(vxQueryTensor(layer_output, VX_TENSOR_DIMS, output_layer_data_dims, sizeof(output_layer_data_dims)));
        yolo->pOutTensor.nTensor[i].w = output_layer_data_dims[0];
        yolo->pOutTensor.nTensor[i].h = output_layer_data_dims[1];
        yolo->pOutTensor.nTensor[i].c = output_layer_data_dims[2];
        yolo->pOutTensor.nTensor[i].n = output_layer_data_dims[3];
        yolo->strides[i] = int(yolo->net_w / yolo->pOutTensor.nTensor[i].w);
        vx_size fl;
        ERROR_CHECK_STATUS(vxQueryTensor(layer_output, VX_TENSOR_FIXED_POINT_POSITION, &fl, sizeof(fl)));
        yolo->pOutTensor.nTensor[i].fl = fl;
        VX_LOG("%s Line:%d %dth layer output_data_dims:[%d,%d,%d,%d] fl:%d strides:[%d]\n", __FUNCTION__, __LINE__, i + 1, yolo->pOutTensor.nTensor[i].w, yolo->pOutTensor.nTensor[i].h,
               yolo->pOutTensor.nTensor[i].c, yolo->pOutTensor.nTensor[i].n, yolo->pOutTensor.nTensor[i].fl, yolo->strides[i]);
    }
    return yolo;
}
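// Run the model-specific layer decoder over every configured output blob and
// apply NMS. Returns the surviving proposals in network-input coordinates.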
static std::vector<Box> post_process(void *hdl, std::vector<std::string>& output_blobs_name,
                                     std::vector<vx_tensor>& output_blobs_tensor, float conf_thresh, float iou_thresh)
{
    nna_yolo_private_t *yolo = (nna_yolo_private_t *)hdl;
    std::map<std::string, int> output_blobs_name_to_index;
    for (size_t i = 0; i < output_blobs_name.size(); ++i) {
        output_blobs_name_to_index[output_blobs_name[i]] = i;
        // VX_LOG("%s output_blobs_name_to_index[%s]=%d\n", __FUNCTION__, output_blobs_name[i].c_str(), i);
    }
    unsigned int image_size = yolo->net_h;
    unsigned int *strides = yolo->strides;
    unsigned int class_num = yolo->label.size();
    std::vector<Box> proposals;
    for (size_t i = 0; i < yolo->yolo_outputs_name_g.size(); ++i) {
        int layer_index = output_blobs_name_to_index[yolo->yolo_outputs_name_g[i]];
        vx_tensor layer_output = output_blobs_tensor[layer_index];
        vx_int8* layer_output_ptr;
        vx_size layer_output_dims[4]; // w,h,c,n
        vx_enum data_type;
        ERROR_CHECK_STATUS(vxQueryTensor(layer_output, VX_TENSOR_MEMORY_VIRT_ADDR, &layer_output_ptr, sizeof(layer_output_ptr)));
        ERROR_CHECK_STATUS(vxQueryTensor(layer_output, VX_TENSOR_DIMS, &layer_output_dims, sizeof(layer_output_dims)));
        ERROR_CHECK_STATUS(vxQueryTensor(layer_output, VX_TENSOR_DATA_TYPE, &data_type, sizeof(data_type)));
        // VX_LOG("%s Line:%d %s ptr:%d dim:[%d,%d,%d,%d] image_size:%d class_num:%d strides:%d dtype:%d\n", __FUNCTION__, __LINE__, yolo->yolo_outputs_name_g[layer_index].c_str(), layer_output_ptr, layer_output_dims[0]
        //        , layer_output_dims[1], layer_output_dims[2], layer_output_dims[3], image_size, class_num, strides[layer_index], data_type);
        int status = VX_SUCCESS; // initialized in case the model type matches neither branch
        if (yolo->model_type == "yolov5") {
            status = yolov5_layer(proposals, layer_output_ptr, layer_output_dims, image_size, strides[layer_index], yolo->pOutTensor.nTensor[layer_index].fl, data_type, yolo->anchors_g + i * 6, class_num, conf_thresh);
        } else if (yolo->model_type == "yolov8") {
            status = yolov8_layer(proposals, layer_output_ptr, layer_output_dims, image_size, strides[layer_index], yolo->pOutTensor.nTensor[layer_index].fl, data_type, class_num, conf_thresh);
        }
        if (status != VX_SUCCESS) {
            VX_LOG("%s Line:%d %dth layer yolov_layer error.\n", __FUNCTION__, __LINE__, i + 1);
        }
    }
    yolo_nms(proposals, iou_thresh);
    return proposals;
}
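// Run one frame of detection: convert/letterbox the input image to BGRA at the
// network resolution, import it into the graph, run inference, decode the
// outputs, and write the boxes (scaled back to source-image coordinates and
// clipped) into yolo_det_out.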
int nna_custom_det_process(void *hdl, ezax_img_t *pImgIn, ezax_boxes_t *yolo_det_out, float conf_thresh, float iou_thresh)
{
    VX_LOG("%s Line:%d conf_thresh:%f iou_thresh:%f\n", __FUNCTION__, __LINE__, conf_thresh, iou_thresh);
    nna_yolo_private_t *yolo = (nna_yolo_private_t *)hdl;
    input_image input_img;
    std::vector<std::string> input_blobs_name;
    std::vector<vx_tensor> input_blobs_tensor;
    std::vector<std::string> output_blobs_name;
    std::vector<vx_tensor> output_blobs_tensor;
    std::vector<Box> boxes;
    vx_status status;
    vx_size dst_virt_addr = vxGetDeviceMemoryVirtualAddress(yolo->dst_handle);
    vx_size dst_phy_addr = vxGetDeviceMemoryPhysicalAddress(yolo->dst_handle);
    vx_size dst_img_size = vxGetDeviceMemorySize(yolo->dst_handle);
    vx_size dst_phy_addr_uv = dst_phy_addr + yolo->net_w * yolo->net_h;
    // letterbox-resize the source image into the BGRA network-input buffer
    img_cvt_param param;
    param.input_fmt = (img_fmt)pImgIn->img_handle.fmt;
    param.input_width = pImgIn->img_handle.w;
    param.input_height = pImgIn->img_handle.h;
    param.input_crop_x = 0;
    param.input_crop_y = 0;
    param.input_crop_w = pImgIn->img_handle.w;
    param.input_crop_h = pImgIn->img_handle.h;
    param.input_color_range = 0;
    // param.output_fmt = (img_fmt)pImgIn->img_handle.fmt;
    param.output_fmt = NNA_BGRA;
    param.output_width = yolo->net_w;
    param.output_height = yolo->net_h;
    param.output_crop_x = 0;
    param.output_crop_y = 0;
    param.output_crop_w = int(pImgIn->img_handle.w * yolo->scale_info);
    param.output_crop_h = int(pImgIn->img_handle.h * yolo->scale_info);
    status = ImageConvert(yolo->handle_context, &param, pImgIn->img_handle.pPhy, pImgIn->img_handle.pPhy_UV, yolo->dst_handle);
    vxInvalidateDeviceMemoryCache(yolo->handle_context, yolo->dst_handle);
    if (status != VX_SUCCESS) {
        VX_LOG("%s Line:%d ImageConvert error.\n", __FUNCTION__, __LINE__);
        return VX_FAILURE;
    }
    // else VX_LOG("%s Line:%d ImageConvert success.\n", __FUNCTION__, __LINE__);
    // VX_LOG("%s Line:%d dst_phy_addr:%d dst_img_size:%d img_handle.w:%d img_handle.h:%d pImgIn->img_handle.sz:%d img_handle.c:%d img_handle.stride:%d fmt:%d\n", __FUNCTION__, __LINE__, dst_phy_addr, dst_img_size, pImgIn->img_handle.w, pImgIn->img_handle.h, pImgIn->img_handle.sz, pImgIn->img_handle.c, pImgIn->img_handle.stride, param.input_fmt);
    GetNetInputBlob(yolo->graph, input_blobs_name, input_blobs_tensor);
    status = ImportNetInputDataFromMem(yolo->graph, input_blobs_name[0], dst_img_size, (vx_uint8*)dst_virt_addr, 0);
    if (status != VX_SUCCESS) {
        VX_LOG("%s Line:%d import data error.\n", __FUNCTION__, __LINE__);
        return VX_FAILURE;
    }
    // else VX_LOG("%s Line:%d import data success.\n", __FUNCTION__, __LINE__);
    input_blobs_name.clear();
    input_blobs_tensor.clear();
    ERROR_CHECK_STATUS(vxProcessGraph(yolo->graph));
    status = vxFinish(yolo->context);
    if (status != VX_SUCCESS) {
        VX_LOG("%s Line:%d vxFinish error.\n", __FUNCTION__, __LINE__);
        return VX_FAILURE;
    }
    // else VX_LOG("%s Line:%d vxFinish success.\n", __FUNCTION__, __LINE__);
    status = GetNetOutputBlob(yolo->graph, output_blobs_name, output_blobs_tensor);
    if (status != VX_SUCCESS) {
        VX_LOG("%s Line:%d GetNetOutputBlob error.\n", __FUNCTION__, __LINE__);
        return VX_FAILURE;
    }
    // else VX_LOG("%s Line:%d GetNetOutputBlob success.\n", __FUNCTION__, __LINE__);
    boxes = post_process(yolo, output_blobs_name, output_blobs_tensor, conf_thresh, iou_thresh);
    for (unsigned int i = 0; i < boxes.size(); i++) {
        if (i >= MAX_CLASSIFICATION_DET_NUM) {
            break;
        }
        Box box = boxes[i];
        ezax_rt_t *prect = &yolo_det_out->pRect[i];
        // map from network-input coordinates back to the source image and clip
        prect->x0 = (int)((box.xyxy[0]) / yolo->scale_info);
        prect->y0 = (int)((box.xyxy[1]) / yolo->scale_info);
        prect->x1 = (int)((box.xyxy[2]) / yolo->scale_info);
        prect->y1 = (int)((box.xyxy[3]) / yolo->scale_info);
        CLIPRETINA(prect->x0, 0, yolo->src_w);
        CLIPRETINA(prect->y0, 0, yolo->src_h);
        CLIPRETINA(prect->x1, 0, yolo->src_w);
        CLIPRETINA(prect->y1, 0, yolo->src_h);
        prect->s = box.score;
        prect->c = box.index;
        // VX_LOG("%s Line:%d final box x0:%d y0:%d x1:%d y1:%d score:%f c_index:%d\n", __FUNCTION__, __LINE__, prect->x0, prect->y0, prect->x1, prect->y1, prect->s, prect->c);
    }
    yolo_det_out->num = (boxes.size() > MAX_CLASSIFICATION_DET_NUM) ? MAX_CLASSIFICATION_DET_NUM : boxes.size();
    return VX_SUCCESS;
}
int nna_custom_det_close(void *hdl)
{
    nna_yolo_private_t *yolo = (nna_yolo_private_t *)hdl;
    if (yolo == NULL)
        return VX_SUCCESS;
    vxInvalidateDeviceMemoryCacheAll(yolo->context);
    vxInvalidateDeviceMemoryCacheAll(yolo->handle_context);
    ERROR_CHECK_STATUS(vxReleaseGraph(&yolo->graph));
    ERROR_CHECK_STATUS(vxUnloadKernels(yolo->context, "openvx-nn"));
    ERROR_CHECK_STATUS(vxReleaseContext(&yolo->context));
    ERROR_CHECK_STATUS(vxReleaseContext(&yolo->handle_context));
    delete yolo; // allocated with new in nna_custom_det_open
    VX_LOG("%s Line:%d done.\n", __FUNCTION__, __LINE__);
    return VX_SUCCESS;
}
int nna_custom_det_cmd(void *hdl, ezax_custom_det_cmd_t cmd, unsigned int args)
{
    nna_yolo_private_t *yolo = (nna_yolo_private_t *)hdl;
    // switch (cmd) {
    //     ezax_freq_t *pfreq = (ezax_freq_t*)args;
    //     case EZAX_YOLO_RESET_AXFREQ:
    //     {
    //         if (pfreq) {
    //             int dev_id = pfreq->id;
    //             int freq = pfreq->freq;
    //             int pre_freq, set_freq;
    //             vxGetDeviceFreq(yolo->context, "nu", pre_freq);
    //             vxSetDeviceFreq(yolo->context, "nu", freq);
    //             vxGetDeviceFreq(yolo->context, "nu", set_freq);
    //             VX_LOG("%s RESET_AXFREQ pre_freq:%d set_freq:%d\n", __FUNCTION__, pre_freq, set_freq);
    //         }
    //     }
    //     case EZAX_YOLO_GET_AXFREQ:
    //     {
    //     }
    // }
    return VX_SUCCESS;
}