From bd44382e55fb4103a4e7662b72915bcd850a10bc Mon Sep 17 00:00:00 2001 From: mittorn Date: Tue, 8 Oct 2024 04:46:19 +0300 Subject: [PATCH] Use async chain of 4 frames with separate command buffers --- vaapi-recorder-hevc.cpp | 134 +++++++++++++- vaapi-recorder.h | 5 + vkcompute.cpp | 374 ++++++++++++++++++++++++---------------- 3 files changed, 360 insertions(+), 153 deletions(-) diff --git a/vaapi-recorder-hevc.cpp b/vaapi-recorder-hevc.cpp index 2d0fdf8..44183a5 100644 --- a/vaapi-recorder-hevc.cpp +++ b/vaapi-recorder-hevc.cpp @@ -106,6 +106,8 @@ enum NALUType { #define ALIGN16(x) ((x+15)&~15) +#define CHAIN_SIZE 4 + struct vaapi_recorder { int drm_fd, output_fd; int width, height; @@ -152,7 +154,7 @@ struct vaapi_recorder { VABufferID output_buf; VASurfaceID output_sync_surf; } encoder; - + VASurfaceID inputFrames[CHAIN_SIZE]; }; /* @@ -2334,6 +2336,131 @@ err_free: return NULL; } +struct vaapi_recorder * +vaapi_recorder_create5(int drm_fd, int width, int height, const char *filename, int *dmabuf_fd, uint64_t *mod, uint32_t *size, uint32_t *offset, uint32_t *pitch1, uint32_t *pitch2, uint64_t *modifiers, int modifierscount) +{ + struct vaapi_recorder *r; + VAStatus status; + int major, minor; + int flags; + VASurfaceAttribExternalBuffers va_attrib_extbuf = {0}; + VADRMPRIMESurfaceDescriptor drmSurface = {0}; + + r = (vaapi_recorder*)calloc(sizeof *r,1); + if (r == NULL) + return NULL; + + r->width = width; + r->height = height; + r->drm_fd = drm_fd; + + flags = O_WRONLY | O_CREAT | O_TRUNC | O_CLOEXEC; + r->output_fd = open(filename, flags, 0644); + if (r->output_fd < 0) + goto err_thread; + + r->va_dpy = vaGetDisplayDRM(drm_fd); + if (!r->va_dpy) { + printf("failed to create VA display\n"); + goto err_fd; + } + + status = vaInitialize(r->va_dpy, &major, &minor); + if (status != VA_STATUS_SUCCESS) { + printf("vaapi: failed to initialize display\n"); + goto err_fd; + } + + if (setup_vpp(r) < 0) { + printf("vaapi: failed to initialize VPP pipeline\n"); + goto err_va_dpy; + } + + if (setup_encoder(r) < 0) { + goto err_vpp; + } + //create_surface_from_fd(r, dmabuf_fd, dmabuf_stride, &gInputRGBA); + VASurfaceAttrib va_attribs[5]; + + + + + //unsigned long buffer_fd[2] = {dmabuf_fd, dmabuf_fd_uv}; + + va_attrib_extbuf.pixel_format = VA_FOURCC_P010; + va_attrib_extbuf.width = r->width; + va_attrib_extbuf.height = r->height; + //va_attrib_extbuf.data_size = r->height * stride; + //va_attrib_extbuf.num_planes = 1; + //va_attrib_extbuf.pitches[0] = stride; + //va_attrib_extbuf.offsets[0] = 0; + //va_attrib_extbuf.buffers = &buffer_fd; + //va_attrib_extbuf.num_buffers = 1; + va_attrib_extbuf.flags = VA_SURFACE_EXTBUF_DESC_ENABLE_TILING; + va_attrib_extbuf.private_data = NULL; + VADRMFormatModifierList modList; + modList.modifiers = modifiers; + modList.num_modifiers = modifierscount; + + va_attribs[0].type = VASurfaceAttribMemoryType; + va_attribs[0].flags = VA_SURFACE_ATTRIB_SETTABLE; + va_attribs[0].value.type = VAGenericValueTypeInteger; + va_attribs[0].value.value.i = VA_SURFACE_ATTRIB_MEM_TYPE_VA; + va_attribs[1].type = VASurfaceAttribUsageHint; + va_attribs[1].flags = VA_SURFACE_ATTRIB_SETTABLE; + va_attribs[1].value.type = VAGenericValueTypeInteger; + va_attribs[1].value.value.i = VA_SURFACE_ATTRIB_USAGE_HINT_EXPORT | VA_SURFACE_ATTRIB_USAGE_HINT_ENCODER; + va_attribs[2].type = VASurfaceAttribPixelFormat; + va_attribs[2].flags = VA_SURFACE_ATTRIB_SETTABLE; + va_attribs[2].value.type = VAGenericValueTypeInteger; + va_attribs[2].value.value.i = VA_FOURCC_P010; + va_attribs[3].type = VASurfaceAttribExternalBufferDescriptor; + va_attribs[3].flags = VA_SURFACE_ATTRIB_SETTABLE; + va_attribs[3].value.type = VAGenericValueTypePointer; + va_attribs[3].value.value.p = &va_attrib_extbuf; + va_attribs[4].type = VASurfaceAttribDRMFormatModifiers; + va_attribs[4].flags = VA_SURFACE_ATTRIB_SETTABLE; + va_attribs[4].value.type = VAGenericValueTypePointer; + va_attribs[4].value.value.p = &modList; + + status = vaCreateSurfaces(r->va_dpy, VA_RT_FORMAT_YUV420_10, + r->width, r->height, r->inputFrames, CHAIN_SIZE, + &va_attribs[0], 5); + printf("%d\n", status); + for(int i = 0; i < CHAIN_SIZE; i++) + { + status = vaExportSurfaceHandle(r->va_dpy, r->inputFrames[i], VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME_2, VA_EXPORT_SURFACE_WRITE_ONLY | VA_EXPORT_SURFACE_SEPARATE_LAYERS, &drmSurface ); + printf("%d %d %llx\n", status, drmSurface.objects[0].fd, drmSurface.objects[0].drm_format_modifier); + dmabuf_fd[i] = drmSurface.objects[0].fd; + if(status != VA_STATUS_SUCCESS) + exit(1); + } + *mod = drmSurface.objects[0].drm_format_modifier; + *size = drmSurface.objects[0].size; + *offset = drmSurface.layers[1].offset[0]; + *pitch1 = drmSurface.layers[0].pitch[0]; + *pitch2 = drmSurface.layers[1].pitch[0]; + + + r->encoder.output_buf = VA_INVALID_ID; + setup_output_thread(r); + + return r; + +err_vpp: + vpp_destroy(r); +err_va_dpy: + vaTerminate(r->va_dpy); +err_fd: + close(r->output_fd); +err_thread: + destroy_worker_thread(r); +err_free: + free(r); + + return NULL; +} + void vaapi_recorder_destroy(struct vaapi_recorder *r) @@ -2479,6 +2606,11 @@ recorder_frame3(struct vaapi_recorder *r) encoder_encode(r, r->vpp.output); } +void +recorder_frame4(struct vaapi_recorder *r, int idx) +{ + encoder_encode(r, r->inputFrames[idx]); +} static void * diff --git a/vaapi-recorder.h b/vaapi-recorder.h index 4cb0f2c..9529f38 100644 --- a/vaapi-recorder.h +++ b/vaapi-recorder.h @@ -38,6 +38,9 @@ vaapi_recorder_create3(int drm_fd, int width, int height, const char *filename, struct vaapi_recorder * vaapi_recorder_create4(int drm_fd, int width, int height, const char *filename, int *dmabuf_fd, uint64_t *mod, uint32_t *size, uint32_t *offset, uint32_t *pitch1, uint32_t *pitch2, uint64_t *modifiers, int modifierscount); +struct vaapi_recorder * +vaapi_recorder_create5(int drm_fd, int width, int height, const char *filename, int *dmabuf_fd, uint64_t *mod, uint32_t *size, uint32_t *offset, uint32_t *pitch1, uint32_t *pitch2, uint64_t *modifiers, int modifierscount); + void vaapi_recorder_destroy(struct vaapi_recorder *r); @@ -48,5 +51,7 @@ void recorder_frame2(struct vaapi_recorder *r); void recorder_frame3(struct vaapi_recorder *r); +void +recorder_frame4(struct vaapi_recorder *r, int idx); #endif /* _VAAPI_RECORDER_H_ */ diff --git a/vkcompute.cpp b/vkcompute.cpp index 5c9c1d2..4877d57 100644 --- a/vkcompute.cpp +++ b/vkcompute.cpp @@ -141,14 +141,13 @@ const bool enableValidationLayers = true; assert(res == VK_SUCCESS); \ } \ } - +#define CHAIN_SIZE 4 /* The application launches a compute shader that renders the mandelbrot set, by rendering it into a storage buffer. The storage buffer is then read from the GPU, and saved as .png. */ -class ComputeApplication { -private: +struct ComputeApplication { // The pixels of the rendered mandelbrot set are in this format: struct Pixel { float r, g, b, a; @@ -186,7 +185,6 @@ private: To allocate such command buffers, we use a command pool. */ VkCommandPool commandPool; - VkCommandBuffer commandBuffer; /* @@ -197,25 +195,40 @@ private: into descriptor sets, which are basically just collections of descriptors. */ VkDescriptorPool descriptorPool; - VkDescriptorSet descriptorSet; VkDescriptorSetLayout descriptorSetLayout; + VkCommandBuffer commandBuffer; + + struct UBO{ + float frameNum; + }; + struct FrameContext + { + VkDescriptorSet descriptorSet; + VkBuffer ubo; + VkDeviceMemory uboMemory; + VkImage image0; + VkDeviceMemory imageMemory0; + VkImageView imageView0; + VkImage image1; + // todo: single memory block? + VkDeviceMemory imageMemory1; + VkImageView imageView1; + VkCommandBuffer commandBuffer; + UBO *pMappedUBO = NULL; + VkFence fence; + bool running; + + } chain[CHAIN_SIZE]; /* The mandelbrot set will be rendered to this buffer. The memory that backs the buffer is bufferMemory. */ - VkBuffer buffer; - VkDeviceMemory bufferMemory; - VkImage image0; - VkDeviceMemory imageMemory0; - VkImageView imageView0; - VkImage image1; - // todo: single memory block? - VkDeviceMemory imageMemory1; - VkImageView imageView1; - uint32_t bufferSize; // size of `buffer` in bytes. + + + //uint32_t bufferSize; // size of `buffer` in bytes. const char * enabledLayers[16]; size_t enabledLayersCount = 0; @@ -239,93 +252,6 @@ private: This variable keeps track of the index of that queue in its family. */ uint32_t queueFamilyIndex; - struct UBO{ - float frameNum; - }; - UBO *pMappedBuffer = NULL; - -public: - void run() { - // Buffer size of the storage buffer that will contain the rendered mandelbrot set. - bufferSize = sizeof(Pixel) * WIDTH * HEIGHT; - - - // Initialize vulkan: - createInstance(); - findPhysicalDevice(); - createDevice(); - createBuffer(); - vkMapMemory(device, bufferMemory, 0, sizeof(UBO), 0, (void**)&pMappedBuffer); - //createImageExportableDmabuf(image0, imageView0, imageMemory0, prime_fd, WIDTH, HEIGHT, VK_FORMAT_R8_UNORM); - //createImageExportableDmabuf(image1, imageView1, imageMemory1, prime_fd_uv, WIDTH/2, HEIGHT/2, VK_FORMAT_R8G8_UNORM); - int drm_fd = drm_fd = open("/dev/dri/renderD128", O_RDWR); - //auto *r = vaapi_recorder_create2(drm_fd, WIDTH, HEIGHT, "out.264", prime_fd, WIDTH * 4); - //auto *r = vaapi_recorder_create3(drm_fd, WIDTH, HEIGHT, "out.264", prime_fd, WIDTH * 4, prime_fd_uv, WIDTH * 2); - uint64_t mod; - uint32_t size, offset, pitch1, pitch2; - int fd; - uint64_t modifiers[32]; - int count = getAvailiableModifiersList(modifiers, 32, VK_FORMAT_R16_UNORM); - auto *r = vaapi_recorder_create4(drm_fd, WIDTH, HEIGHT, "out.264", &fd, &mod, &size, &offset, &pitch1, &pitch2, modifiers, count); - createImageDumbDmabuf2(image0, imageView0, imageMemory0, fd, mod, size, offset, pitch1, pitch2); - createDescriptorSetLayout(); - //createDescriptorSet(); - createComputePipeline(); - createCommandBuffer(); - int frameNum = 0; - - /* - We create a fence. - */ - VkFenceCreateInfo fenceCreateInfo = {}; - fenceCreateInfo.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO; - fenceCreateInfo.flags = 0; - VK_CHECK_RESULT(vkCreateFence(device, &fenceCreateInfo, NULL, &fence)); - - while(frameNum++ < 1000) - { - // Finally, run the recorded command buffer. - runCommandBuffer(); - //usleep(10000); - recorder_frame3(r); - pMappedBuffer->frameNum = frameNum; - //usleep(10000); - } - - vkDestroyFence(device, fence, NULL); - vkUnmapMemory(device, bufferMemory); - - // The former command rendered a mandelbrot set to a buffer. - // Save that buffer as a png on disk. - //saveRenderedImage(); - - // Clean up all vulkan resources. - cleanup(); - } - - void saveRenderedImage() { - void* mappedMemory = NULL; -#if 1 - // Map the buffer memory, so that we can read from it on the CPU. - vkMapMemory(device, imageMemory0, 0, VK_WHOLE_SIZE, 0, &mappedMemory); - //Pixel* pmappedMemory = (Pixel *)mappedMemory; - FILE *f = fopen("out.bin","wb"); - fwrite(mappedMemory, 4, WIDTH * HEIGHT, f); - fclose(f); - - // Done reading, so unmap. - vkUnmapMemory(device, imageMemory0); -#else - static char mem[WIDTH * HEIGHT*4]; - memcpy(mem, gDrm.mapped_buffer, WIDTH * HEIGHT * 4); - // Now we save the acquired color data to a .png. -// unsigned error = lodepng::encode("mandelbrot.png", image, WIDTH, HEIGHT); - //if (error) printf("encoder error %d: %s", error, lodepng_error_text(error)); - FILE *f = fopen("out.bin","wb"); - fwrite(mem, 4, WIDTH * HEIGHT, f); - fclose(f); -#endif - } static VKAPI_ATTR VkBool32 VKAPI_CALL debugReportCallbackFn( VkDebugReportFlagsEXT flags, @@ -612,7 +538,7 @@ public: return -1; } - void createBuffer() { + void createUBO(int chidx) { /* We will now create a buffer. We will render the mandelbrot set into this buffer in a computer shade later. @@ -620,11 +546,11 @@ public: VkBufferCreateInfo bufferCreateInfo = {}; bufferCreateInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; - bufferCreateInfo.size = bufferSize; // buffer size in bytes. + bufferCreateInfo.size = sizeof(UBO); // buffer size in bytes. bufferCreateInfo.usage = VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT; // buffer is used as a storage buffer. bufferCreateInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; // buffer is exclusive to a single queue family at a time. - VK_CHECK_RESULT(vkCreateBuffer(device, &bufferCreateInfo, NULL, &buffer)); // create buffer. + VK_CHECK_RESULT(vkCreateBuffer(device, &bufferCreateInfo, NULL, &chain[chidx].ubo)); // create buffer. /* But the buffer doesn't allocate memory for itself, so we must do that manually. @@ -634,7 +560,7 @@ public: First, we find the memory requirements for the buffer. */ VkMemoryRequirements memoryRequirements; - vkGetBufferMemoryRequirements(device, buffer, &memoryRequirements); + vkGetBufferMemoryRequirements(device, chain[chidx].ubo, &memoryRequirements); /* Now use obtained memory requirements info to allocate the memory for the buffer. @@ -655,10 +581,11 @@ public: allocateInfo.memoryTypeIndex = findMemoryType( memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_HOST_COHERENT_BIT | VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT); - VK_CHECK_RESULT(vkAllocateMemory(device, &allocateInfo, NULL, &bufferMemory)); // allocate memory on device. + VK_CHECK_RESULT(vkAllocateMemory(device, &allocateInfo, NULL, &chain[chidx].uboMemory)); // allocate memory on device. // Now associate that allocated memory with the buffer. With that, the buffer is backed by actual memory. - VK_CHECK_RESULT(vkBindBufferMemory(device, buffer, bufferMemory, 0)); + VK_CHECK_RESULT(vkBindBufferMemory(device, chain[chidx].ubo, chain[chidx].uboMemory, 0)); + vkMapMemory(device, chain[chidx].uboMemory, 0, sizeof(UBO), 0, (void**)&chain[chidx].pMappedUBO); } int getAvailiableModifiersList(uint64_t *modifiers2, size_t len, VkFormat fmt) { @@ -770,7 +697,7 @@ public: } // create and import dmabuf - void createImageDumbDmabuf2(VkImage &image, VkImageView &imageView, VkDeviceMemory &imageMemory, int fd, uint64_t mod, uint32_t size, uint32_t offset, uint32_t pitch1, uint32_t pitch2) { + void createImageDumbDmabuf2(VkImage &image, VkImageView &imageView, VkDeviceMemory &imageMemory, VkImage &image1, VkImageView &imageView1, VkDeviceMemory &imageMemory1, int fd, uint64_t mod, uint32_t size, uint32_t offset, uint32_t pitch1, uint32_t pitch2) { /* We will now create a buffer. We will render the mandelbrot set into this buffer in a computer shade later. @@ -871,7 +798,7 @@ public: view.image = image1; VK_CHECK_RESULT(vkCreateImageView(device, &view, nullptr, &imageView1)); } - +#if 0 // create and import dmabuf as opaque fd, allows any tiling void createImageDumbOpaque(VkImage &image, VkImageView &imageView, VkDeviceMemory &imageMemory) { /* @@ -1051,7 +978,7 @@ public: printf("imageModifier %llx\n", imageModifiers.drmFormatModifier); // todo: get subresource plane info (vkGetImageSubresourceLayout) } - +#endif void createDescriptorSetLayout() { /* Here we specify a descriptor set layout. This allows us to bind our descriptors to @@ -1090,8 +1017,8 @@ public: // Create the descriptor set layout. VK_CHECK_RESULT(vkCreateDescriptorSetLayout(device, &descriptorSetLayoutCreateInfo, NULL, &descriptorSetLayout)); } - - void createDescriptorSet() { + void createDescriptorPool() + { /* So we will allocate a descriptor set here. But we need to first create a descriptor pool to do that. @@ -1102,18 +1029,20 @@ public: */ VkDescriptorPoolSize descriptorPoolSize[2] = {}; descriptorPoolSize[0].type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE; - descriptorPoolSize[0].descriptorCount = 2; + descriptorPoolSize[0].descriptorCount = 2*CHAIN_SIZE; descriptorPoolSize[1].type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER; - descriptorPoolSize[1].descriptorCount = 1; + descriptorPoolSize[1].descriptorCount = 1*CHAIN_SIZE; VkDescriptorPoolCreateInfo descriptorPoolCreateInfo = {}; descriptorPoolCreateInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO; - descriptorPoolCreateInfo.maxSets = 1; // we only need to allocate one descriptor set from the pool. + descriptorPoolCreateInfo.maxSets = CHAIN_SIZE; // we only need to allocate one descriptor set from the pool. descriptorPoolCreateInfo.poolSizeCount = 2; descriptorPoolCreateInfo.pPoolSizes = descriptorPoolSize; // create descriptor pool. VK_CHECK_RESULT(vkCreateDescriptorPool(device, &descriptorPoolCreateInfo, NULL, &descriptorPool)); + } + void createDescriptorSet(int chidx) { /* With the pool allocated, we can now allocate the descriptor set. */ @@ -1124,7 +1053,7 @@ public: descriptorSetAllocateInfo.pSetLayouts = &descriptorSetLayout; // allocate descriptor set. - VK_CHECK_RESULT(vkAllocateDescriptorSets(device, &descriptorSetAllocateInfo, &descriptorSet)); + VK_CHECK_RESULT(vkAllocateDescriptorSets(device, &descriptorSetAllocateInfo, &chain[chidx].descriptorSet)); /* Next, we need to connect our actual storage buffer with the descrptor. @@ -1134,29 +1063,29 @@ public: // Specify the buffer to bind to the descriptor. VkDescriptorImageInfo descriptorImageInfo[2] = {}; - descriptorImageInfo[0].imageView = imageView0; + descriptorImageInfo[0].imageView = chain[chidx].imageView0; descriptorImageInfo[0].imageLayout = VK_IMAGE_LAYOUT_GENERAL; - descriptorImageInfo[1].imageView = imageView1; + descriptorImageInfo[1].imageView = chain[chidx].imageView1; descriptorImageInfo[1].imageLayout = VK_IMAGE_LAYOUT_GENERAL; VkWriteDescriptorSet writeDescriptorSet[2] = {}; writeDescriptorSet[0].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; - writeDescriptorSet[0].dstSet = descriptorSet; // write to this descriptor set. + writeDescriptorSet[0].dstSet = chain[chidx].descriptorSet; // write to this descriptor set. writeDescriptorSet[0].dstBinding = 0; // write to the first, and only binding. writeDescriptorSet[0].descriptorCount = 1; // update a single descriptor. writeDescriptorSet[0].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE; // storage buffer. writeDescriptorSet[0].pImageInfo = &descriptorImageInfo[0]; writeDescriptorSet[1].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; - writeDescriptorSet[1].dstSet = descriptorSet; // write to this descriptor set. + writeDescriptorSet[1].dstSet = chain[chidx].descriptorSet; // write to this descriptor set. writeDescriptorSet[1].dstBinding = 1; // write to the first, and only binding. writeDescriptorSet[1].descriptorCount = 1; // update a single descriptor. writeDescriptorSet[1].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE; // storage buffer. writeDescriptorSet[1].pImageInfo = &descriptorImageInfo[1]; VkDescriptorBufferInfo descriptorBufferInfo = {}; - descriptorBufferInfo.buffer = buffer; + descriptorBufferInfo.buffer = chain[chidx].ubo; descriptorBufferInfo.offset = 0; - descriptorBufferInfo.range = bufferSize; + descriptorBufferInfo.range = sizeof(UBO); // perform the update of the descriptor set. vkUpdateDescriptorSets(device, 2, writeDescriptorSet, 0, NULL); writeDescriptorSet[0].dstBinding = 2; @@ -1252,8 +1181,8 @@ public: 1, &pipelineCreateInfo, NULL, &pipeline)); } - - void createCommandBuffer() { + void createCommandPool() + { /* We are getting closer to the end. In order to send commands to the device(GPU), we must first record commands into a command buffer. @@ -1261,12 +1190,11 @@ public: */ VkCommandPoolCreateInfo commandPoolCreateInfo = {}; commandPoolCreateInfo.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO; - commandPoolCreateInfo.flags = 0; + commandPoolCreateInfo.flags = VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT; // the queue family of this command pool. All command buffers allocated from this command pool, // must be submitted to queues of this family ONLY. commandPoolCreateInfo.queueFamilyIndex = queueFamilyIndex; VK_CHECK_RESULT(vkCreateCommandPool(device, &commandPoolCreateInfo, NULL, &commandPool)); - /* Now allocate a command buffer from the command pool. */ @@ -1280,48 +1208,94 @@ public: commandBufferAllocateInfo.commandBufferCount = 1; // allocate a single command buffer. VK_CHECK_RESULT(vkAllocateCommandBuffers(device, &commandBufferAllocateInfo, &commandBuffer)); // allocate command buffer. - /* - Now we shall start recording commands into the newly allocated command buffer. - */ + } + void prepareImage(int chidx) + { VkCommandBufferBeginInfo beginInfo = {}; beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; beginInfo.flags = 0; // the buffer is only submitted and used once in this application. VK_CHECK_RESULT(vkBeginCommandBuffer(commandBuffer, &beginInfo)); // start recording commands. - /* - We need to bind a pipeline, AND a descriptor set before we dispatch. - The validation layer will NOT give warnings if you forget these, so be very careful not to forget them. - */ vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); VkImageMemoryBarrier imageMemoryBarrier = {VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER}; imageMemoryBarrier.oldLayout = VK_IMAGE_LAYOUT_UNDEFINED; imageMemoryBarrier.newLayout = VK_IMAGE_LAYOUT_GENERAL; - imageMemoryBarrier.image = image0; + imageMemoryBarrier.image = chain[chidx].image0; imageMemoryBarrier.subresourceRange = { VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1 }; // imageMemoryBarrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT; // imageMemoryBarrier.dstAccessMask = VK_ACCESS_MEMORY_READ_BIT; vkCmdPipelineBarrier(commandBuffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 0, nullptr, 0, nullptr, 1, &imageMemoryBarrier); - imageMemoryBarrier.image = image1; + imageMemoryBarrier.image = chain[chidx].image1; vkCmdPipelineBarrier(commandBuffer, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 0, nullptr, 0, nullptr, 1, &imageMemoryBarrier); + VK_CHECK_RESULT(vkEndCommandBuffer(commandBuffer)); // end recording commands. + VkFence fence; + VkFenceCreateInfo fenceCreateInfo = {}; + fenceCreateInfo.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO; + fenceCreateInfo.flags = 0; + VK_CHECK_RESULT(vkCreateFence(device, &fenceCreateInfo, NULL, &fence)); - createDescriptorSet(); - vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipelineLayout, 0, 1, &descriptorSet, 0, NULL); + VkSubmitInfo submitInfo = {}; + submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; + submitInfo.commandBufferCount = 1; // submit a single command buffer + submitInfo.pCommandBuffers = &commandBuffer; // the command buffer to submit. + VK_CHECK_RESULT(vkQueueSubmit(queue, 1, &submitInfo, fence)); + VK_CHECK_RESULT(vkWaitForFences(device, 1, &fence, VK_TRUE, 100000000000)); + vkDestroyFence(device, fence, NULL); + vkResetCommandBuffer(commandBuffer, 0); + } + + void createCommandBuffer(int chidx) { + + /* + Now allocate a command buffer from the command pool. + */ + VkCommandBufferAllocateInfo commandBufferAllocateInfo = {}; + commandBufferAllocateInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO; + commandBufferAllocateInfo.commandPool = commandPool; // specify the command pool to allocate from. + // if the command buffer is primary, it can be directly submitted to queues. + // A secondary buffer has to be called from some primary command buffer, and cannot be directly + // submitted to a queue. To keep things simple, we use a primary command buffer. + commandBufferAllocateInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; + commandBufferAllocateInfo.commandBufferCount = 1; // allocate a single command buffer. + VK_CHECK_RESULT(vkAllocateCommandBuffers(device, &commandBufferAllocateInfo, &chain[chidx].commandBuffer)); // allocate command buffer. + VkCommandBufferBeginInfo beginInfo = {}; + beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; + beginInfo.flags = 0; // the buffer is only submitted and used once in this application. + VK_CHECK_RESULT(vkBeginCommandBuffer(chain[chidx].commandBuffer, &beginInfo)); // start recording commands. + /* + We need to bind a pipeline, AND a descriptor set before we dispatch. + + The validation layer will NOT give warnings if you forget these, so be very careful not to forget them. + */ + vkCmdBindPipeline(chain[chidx].commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); + /* + Now we shall start recording commands into the newly allocated command buffer. + */ + vkCmdBindDescriptorSets(chain[chidx].commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipelineLayout, 0, 1, &chain[chidx].descriptorSet, 0, NULL); /* Calling vkCmdDispatch basically starts the compute pipeline, and executes the compute shader. The number of workgroups is specified in the arguments. If you are already familiar with compute shaders from OpenGL, this should be nothing new to you. */ - vkCmdDispatch(commandBuffer, (uint32_t)ceil(WIDTH/2 / float(WORKGROUP_SIZE)), (uint32_t)ceil(HEIGHT/2 / float(WORKGROUP_SIZE)), 1); + vkCmdDispatch(chain[chidx].commandBuffer, (uint32_t)ceil(WIDTH/2 / float(WORKGROUP_SIZE)), (uint32_t)ceil(HEIGHT/2 / float(WORKGROUP_SIZE)), 1); + + VK_CHECK_RESULT(vkEndCommandBuffer(chain[chidx].commandBuffer)); // end recording commands. + /* + We create a fence. + */ + VkFenceCreateInfo fenceCreateInfo = {}; + fenceCreateInfo.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO; + fenceCreateInfo.flags = 0; + VK_CHECK_RESULT(vkCreateFence(device, &fenceCreateInfo, NULL, &chain[chidx].fence)); - VK_CHECK_RESULT(vkEndCommandBuffer(commandBuffer)); // end recording commands. } - VkFence fence; - void runCommandBuffer() { + + void runCommandBuffer(int chidx) { /* Now we shall finally submit the recorded command buffer to a queue. */ @@ -1329,18 +1303,22 @@ public: VkSubmitInfo submitInfo = {}; submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; submitInfo.commandBufferCount = 1; // submit a single command buffer - submitInfo.pCommandBuffers = &commandBuffer; // the command buffer to submit. + submitInfo.pCommandBuffers = &chain[chidx].commandBuffer; // the command buffer to submit. // static bool b; //if(b) - //VK_CHECK_RESULT(vkWaitForFences(device, 1, &fence, VK_TRUE, 100000000000)); + // //b = 1; - vkResetFences(device, 1, &fence); + if(chain[chidx].running) + VK_CHECK_RESULT(vkWaitForFences(device, 1, &chain[chidx].fence, VK_TRUE, 100000000000)); + + vkResetFences(device, 1, &chain[chidx].fence); /* We submit the command buffer on the queue, at the same time giving a fence. */ - VK_CHECK_RESULT(vkQueueSubmit(queue, 1, &submitInfo, fence)); + VK_CHECK_RESULT(vkQueueSubmit(queue, 1, &submitInfo, chain[chidx].fence)); + chain[chidx].running = true; /* The command will not have finished executing until the fence is signalled. So we wait here. @@ -1348,9 +1326,101 @@ public: and we will not be sure that the command has finished executing unless we wait for the fence. Hence, we use a fence here. */ - VK_CHECK_RESULT(vkWaitForFences(device, 1, &fence, VK_TRUE, 100000000000)); + //VK_CHECK_RESULT(vkWaitForFences(device, 1, &fence, VK_TRUE, 100000000000)); } + void run() { + // Buffer size of the storage buffer that will contain the rendered mandelbrot set. + //bufferSize = sizeof(Pixel) * WIDTH * HEIGHT; + + // Initialize vulkan: + createInstance(); + findPhysicalDevice(); + createDevice(); + + //createImageExportableDmabuf(image0, imageView0, imageMemory0, prime_fd, WIDTH, HEIGHT, VK_FORMAT_R8_UNORM); + //createImageExportableDmabuf(image1, imageView1, imageMemory1, prime_fd_uv, WIDTH/2, HEIGHT/2, VK_FORMAT_R8G8_UNORM); + int drm_fd = drm_fd = open("/dev/dri/renderD128", O_RDWR); + //auto *r = vaapi_recorder_create2(drm_fd, WIDTH, HEIGHT, "out.264", prime_fd, WIDTH * 4); + //auto *r = vaapi_recorder_create3(drm_fd, WIDTH, HEIGHT, "out.264", prime_fd, WIDTH * 4, prime_fd_uv, WIDTH * 2); + uint64_t mod; + uint32_t size, offset, pitch1, pitch2; + int fd[CHAIN_SIZE]; + uint64_t modifiers[32]; + int count = getAvailiableModifiersList(modifiers, 32, VK_FORMAT_R16_UNORM); + auto *r = vaapi_recorder_create5(drm_fd, WIDTH, HEIGHT, "out.264", fd, &mod, &size, &offset, &pitch1, &pitch2, modifiers, count); + for(int i = 0; i < CHAIN_SIZE; i++) + { + createUBO(i); + createImageDumbDmabuf2(chain[i].image0, chain[i].imageView0, chain[i].imageMemory0, chain[i].image1, chain[i].imageView1, chain[i].imageMemory1, + fd[i], mod, size, offset, pitch1, pitch2); + } + + createDescriptorSetLayout(); + createDescriptorPool(); + createComputePipeline(); + createCommandPool(); + for(int i = 0; i < CHAIN_SIZE; i++) + { + prepareImage(i); + createDescriptorSet(i); + createCommandBuffer(i); + } + int frameNum = 0; + + while(frameNum++ < 1000) + { + int chidx = frameNum & 3; + // Finally, run the recorded command buffer. + runCommandBuffer(chidx); + //usleep(10000); + recorder_frame4(r, chidx); + chain[chidx].pMappedUBO->frameNum = frameNum; + //usleep(10000); + + } + for(int i = 0; i < CHAIN_SIZE; i++) + { + if(chain[i].running) + VK_CHECK_RESULT(vkWaitForFences(device, 1, &chain[i].fence, VK_TRUE, 100000000000)); + } + exit(0); + + //vkDestroyFence(device, fence, NULL); + //vkUnmapMemory(device, bufferMemory); + + // The former command rendered a mandelbrot set to a buffer. + // Save that buffer as a png on disk. + //saveRenderedImage(); + + // Clean up all vulkan resources. + cleanup(); + } +#if 0 + void saveRenderedImage() { + void* mappedMemory = NULL; +#if 1 + // Map the buffer memory, so that we can read from it on the CPU. + vkMapMemory(device, imageMemory0, 0, VK_WHOLE_SIZE, 0, &mappedMemory); + //Pixel* pmappedMemory = (Pixel *)mappedMemory; + FILE *f = fopen("out.bin","wb"); + fwrite(mappedMemory, 4, WIDTH * HEIGHT, f); + fclose(f); + + // Done reading, so unmap. + vkUnmapMemory(device, imageMemory0); +#else + static char mem[WIDTH * HEIGHT*4]; + memcpy(mem, gDrm.mapped_buffer, WIDTH * HEIGHT * 4); + // Now we save the acquired color data to a .png. +// unsigned error = lodepng::encode("mandelbrot.png", image, WIDTH, HEIGHT); + //if (error) printf("encoder error %d: %s", error, lodepng_error_text(error)); + FILE *f = fopen("out.bin","wb"); + fwrite(mem, 4, WIDTH * HEIGHT, f); + fclose(f); +#endif + } +#endif void cleanup() { /* Clean up all Vulkan Resources. @@ -1366,11 +1436,11 @@ public: func(instance, debugReportCallback, NULL); } - vkFreeMemory(device, bufferMemory, NULL); - vkDestroyBuffer(device, buffer, NULL); - vkFreeMemory(device, imageMemory0, NULL); - vkDestroyImageView(device, imageView0, NULL); - vkDestroyImage(device, image0, NULL); + //vkFreeMemory(device, bufferMemory, NULL); + //vkDestroyBuffer(device, buffer, NULL); + //vkFreeMemory(device, imageMemory0, NULL); + //vkDestroyImageView(device, imageView0, NULL); + //vkDestroyImage(device, image0, NULL); vkDestroyShaderModule(device, computeShaderModule, NULL); vkDestroyDescriptorPool(device, descriptorPool, NULL); vkDestroyDescriptorSetLayout(device, descriptorSetLayout, NULL);