first commit

This commit is contained in:
nqthai199@gmail.com
2022-09-15 09:26:49 +07:00
commit df3dd9a705
168 changed files with 67159 additions and 0 deletions

440
lib_ncnn/ncnn/allocator.h Executable file
View File

@@ -0,0 +1,440 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
#ifndef NCNN_ALLOCATOR_H
#define NCNN_ALLOCATOR_H
#ifdef _WIN32
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#endif
#include "platform.h"
#include <stdlib.h>
#if NCNN_VULKAN
#include <vulkan/vulkan.h>
#endif // NCNN_VULKAN
#if NCNN_PLATFORM_API
#if __ANDROID_API__ >= 26
#include <android/hardware_buffer.h>
#endif // __ANDROID_API__ >= 26
#endif // NCNN_PLATFORM_API
namespace ncnn {
// the alignment of all the allocated buffers
#if NCNN_AVX512
#define NCNN_MALLOC_ALIGN 64
#elif NCNN_AVX
#define NCNN_MALLOC_ALIGN 32
#else
#define NCNN_MALLOC_ALIGN 16
#endif
// we have some optimized kernels that may overread buffer a bit in loop
// it is common to interleave next-loop data load with arithmetic instructions
// allocating more bytes keeps us safe from SEGV_ACCERR failure
#define NCNN_MALLOC_OVERREAD 64
// Round a pointer up to the next multiple of n bytes.
// ptr  pointer to align
// n    alignment in bytes; must be a power of two
template<typename _Tp>
static NCNN_FORCEINLINE _Tp* alignPtr(_Tp* ptr, int n = (int)sizeof(_Tp))
{
    // For power-of-two n, masking with ~(n - 1) clears the low bits,
    // which is equivalent to the classic (addr + n - 1) & -n trick.
    size_t addr = (size_t)ptr;
    size_t aligned = (addr + (size_t)n - 1) & ~((size_t)n - 1);
    return (_Tp*)aligned;
}
// Round a buffer size up to the next multiple of n bytes.
// Returns the smallest value >= sz that is divisible by n.
// sz  buffer size to align
// n   alignment in bytes; must be a power of two
static NCNN_FORCEINLINE size_t alignSize(size_t sz, int n)
{
    // power-of-two alignment via bit masking; identical to (sz + n - 1) & -n
    size_t mask = (size_t)n - 1;
    return (sz + mask) & ~mask;
}
// Allocate a buffer aligned to NCNN_MALLOC_ALIGN bytes.
// On all branches except MSVC, NCNN_MALLOC_OVERREAD extra bytes are requested
// so kernels that read slightly past the logical end stay inside the allocation.
// Returns 0 on failure. Must be released with fastFree.
static NCNN_FORCEINLINE void* fastMalloc(size_t size)
{
#if _MSC_VER
    return _aligned_malloc(size, NCNN_MALLOC_ALIGN);
#elif (defined(__unix__) || defined(__APPLE__)) && _POSIX_C_SOURCE >= 200112L || (__ANDROID__ && __ANDROID_API__ >= 17)
    void* ptr = 0;
    // posix_memalign returns non-zero on failure and leaves ptr unspecified
    if (posix_memalign(&ptr, NCNN_MALLOC_ALIGN, size + NCNN_MALLOC_OVERREAD))
        ptr = 0;
    return ptr;
#elif __ANDROID__ && __ANDROID_API__ < 17
    // old android NDK lacks posix_memalign; memalign'd memory may be free'd
    return memalign(NCNN_MALLOC_ALIGN, size + NCNN_MALLOC_OVERREAD);
#else
    // generic fallback: over-allocate, align manually, and stash the raw
    // malloc pointer just before the aligned address so fastFree can recover it
    unsigned char* udata = (unsigned char*)malloc(size + sizeof(void*) + NCNN_MALLOC_ALIGN + NCNN_MALLOC_OVERREAD);
    if (!udata)
        return 0;
    unsigned char** adata = alignPtr((unsigned char**)udata + 1, NCNN_MALLOC_ALIGN);
    adata[-1] = udata;
    return adata;
#endif
}
// Release a buffer obtained from fastMalloc.
// Each branch mirrors the corresponding allocation branch in fastMalloc;
// a null pointer is silently ignored.
static NCNN_FORCEINLINE void fastFree(void* ptr)
{
    if (ptr)
    {
#if _MSC_VER
        _aligned_free(ptr);
#elif (defined(__unix__) || defined(__APPLE__)) && _POSIX_C_SOURCE >= 200112L || (__ANDROID__ && __ANDROID_API__ >= 17)
        free(ptr);
#elif __ANDROID__ && __ANDROID_API__ < 17
        free(ptr);
#else
        // recover the raw malloc pointer stored just before the aligned address
        unsigned char* udata = ((unsigned char**)ptr)[-1];
        free(udata);
#endif
    }
}
#if NCNN_THREADS
// exchange-add operation for atomic operations on reference counters
// NCNN_XADD(addr, delta): atomically { old = *addr; *addr += delta; return old; }
// The implementation is selected per compiler/target below.
#if defined __riscv && !defined __riscv_atomic
// riscv target without A extension
// no atomic instructions available; falls back to a plain read-modify-write
static NCNN_FORCEINLINE int NCNN_XADD(int* addr, int delta)
{
    int tmp = *addr;
    *addr += delta;
    return tmp;
}
#elif defined __INTEL_COMPILER && !(defined WIN32 || defined _WIN32)
// atomic increment on the linux version of the Intel(tm) compiler
#define NCNN_XADD(addr, delta) (int)_InterlockedExchangeAdd(const_cast<void*>(reinterpret_cast<volatile void*>(addr)), delta)
#elif defined __GNUC__
#if defined __clang__ && __clang_major__ >= 3 && !defined __ANDROID__ && !defined __EMSCRIPTEN__ && !defined(__CUDACC__)
#ifdef __ATOMIC_ACQ_REL
#define NCNN_XADD(addr, delta) __c11_atomic_fetch_add((_Atomic(int)*)(addr), delta, __ATOMIC_ACQ_REL)
#else
// 4 is the numeric value of memory_order_acq_rel for old clang without __ATOMIC_ACQ_REL
#define NCNN_XADD(addr, delta) __atomic_fetch_add((_Atomic(int)*)(addr), delta, 4)
#endif
#else
#if defined __ATOMIC_ACQ_REL && !defined __clang__
// version for gcc >= 4.7
#define NCNN_XADD(addr, delta) (int)__atomic_fetch_add((unsigned*)(addr), (unsigned)(delta), __ATOMIC_ACQ_REL)
#else
// legacy gcc builtin with full barrier semantics
#define NCNN_XADD(addr, delta) (int)__sync_fetch_and_add((unsigned*)(addr), (unsigned)(delta))
#endif
#endif
#elif defined _MSC_VER && !defined RC_INVOKED
#define NCNN_XADD(addr, delta) (int)_InterlockedExchangeAdd((long volatile*)addr, delta)
#else
// thread-unsafe branch
static NCNN_FORCEINLINE int NCNN_XADD(int* addr, int delta)
{
    int tmp = *addr;
    *addr += delta;
    return tmp;
}
#endif
#else // NCNN_THREADS
// single-threaded build: no atomicity required
static NCNN_FORCEINLINE int NCNN_XADD(int* addr, int delta)
{
    int tmp = *addr;
    *addr += delta;
    return tmp;
}
#endif // NCNN_THREADS
// Abstract interface for user-supplied memory allocators.
// Implementations must pair fastMalloc/fastFree consistently.
class NCNN_EXPORT Allocator
{
public:
    virtual ~Allocator();
    // allocate a buffer of at least `size` bytes; returns 0 on failure
    virtual void* fastMalloc(size_t size) = 0;
    // release a buffer previously returned by this allocator's fastMalloc
    virtual void fastFree(void* ptr) = 0;
};
class PoolAllocatorPrivate;
// Allocator that keeps freed buffers in a pool for later reuse.
// NOTE(review): locking behavior lives in the .cpp; the Unlocked variant
// below suggests this one is the synchronized version - confirm there.
class NCNN_EXPORT PoolAllocator : public Allocator
{
public:
    PoolAllocator();
    ~PoolAllocator();
    // ratio range 0 ~ 1
    // default cr = 0.75
    void set_size_compare_ratio(float scr);
    // release all budgets immediately
    void clear();
    virtual void* fastMalloc(size_t size);
    virtual void fastFree(void* ptr);
private:
    // non-copyable: declared but not defined
    PoolAllocator(const PoolAllocator&);
    PoolAllocator& operator=(const PoolAllocator&);
private:
    // pimpl: hides implementation state from the public header
    PoolAllocatorPrivate* const d;
};
class UnlockedPoolAllocatorPrivate;
// Pooling allocator variant without internal locking; the caller is
// responsible for confining it to a single thread.
class NCNN_EXPORT UnlockedPoolAllocator : public Allocator
{
public:
    UnlockedPoolAllocator();
    ~UnlockedPoolAllocator();
    // ratio range 0 ~ 1
    // default cr = 0.75
    void set_size_compare_ratio(float scr);
    // release all budgets immediately
    void clear();
    virtual void* fastMalloc(size_t size);
    virtual void fastFree(void* ptr);
private:
    // non-copyable: declared but not defined
    UnlockedPoolAllocator(const UnlockedPoolAllocator&);
    UnlockedPoolAllocator& operator=(const UnlockedPoolAllocator&);
private:
    // pimpl: hides implementation state from the public header
    UnlockedPoolAllocatorPrivate* const d;
};
#if NCNN_VULKAN
class VulkanDevice;
// Bookkeeping record for a vulkan buffer allocation handed out by a VkAllocator.
class NCNN_EXPORT VkBufferMemory
{
public:
    VkBuffer buffer;
    // the base offset assigned by allocator
    size_t offset;
    size_t capacity;
    VkDeviceMemory memory;
    // host-visible mapping of the memory, if mappable
    void* mapped_ptr;
    // buffer state, modified by command functions internally
    mutable VkAccessFlags access_flags;
    mutable VkPipelineStageFlags stage_flags;
    // initialize and modified by mat
    int refcount;
};
// Bookkeeping record for a vulkan image allocation handed out by a VkAllocator.
class NCNN_EXPORT VkImageMemory
{
public:
    VkImage image;
    VkImageView imageview;
    // underlying info assigned by allocator
    int width;
    int height;
    int depth;
    VkFormat format;
    VkDeviceMemory memory;
    // host-visible mapping of the memory, if mappable
    void* mapped_ptr;
    // the base offset assigned by allocator
    size_t bind_offset;
    size_t bind_capacity;
    // image state, modified by command functions internally
    mutable VkAccessFlags access_flags;
    mutable VkImageLayout image_layout;
    mutable VkPipelineStageFlags stage_flags;
    // in-execution state, modified by command functions internally
    mutable int command_refcount;
    // initialize and modified by mat
    int refcount;
};
// Abstract base class for vulkan device memory allocators.
// Subclasses implement the buffer and image fastMalloc/fastFree pairs.
class NCNN_EXPORT VkAllocator
{
public:
    explicit VkAllocator(const VulkanDevice* _vkdev);
    virtual ~VkAllocator();
    // release any cached allocations held by the subclass
    virtual void clear();
    // buffer allocation interface
    virtual VkBufferMemory* fastMalloc(size_t size) = 0;
    virtual void fastFree(VkBufferMemory* ptr) = 0;
    // flush host writes / invalidate host caches for the mapped range
    // NOTE(review): presumably no-ops for coherent memory - confirm in the .cpp
    virtual int flush(VkBufferMemory* ptr);
    virtual int invalidate(VkBufferMemory* ptr);
    // image allocation interface
    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack) = 0;
    virtual void fastFree(VkImageMemory* ptr) = 0;
public:
    const VulkanDevice* vkdev;
    uint32_t buffer_memory_type_index;
    uint32_t image_memory_type_index;
    uint32_t reserved_type_index;
    // properties of the selected memory type
    bool mappable;
    bool coherent;
protected:
    // helpers for subclasses to create the underlying vulkan objects
    VkBuffer create_buffer(size_t size, VkBufferUsageFlags usage);
    VkDeviceMemory allocate_memory(size_t size, uint32_t memory_type_index);
    VkDeviceMemory allocate_dedicated_memory(size_t size, uint32_t memory_type_index, VkImage image, VkBuffer buffer);
    VkImage create_image(int width, int height, int depth, VkFormat format, VkImageTiling tiling, VkImageUsageFlags usage);
    VkImageView create_imageview(VkImage image, VkFormat format);
};
class VkBlobAllocatorPrivate;
// Vulkan allocator that sub-allocates blob memory from large preferred-size blocks.
class NCNN_EXPORT VkBlobAllocator : public VkAllocator
{
public:
    explicit VkBlobAllocator(const VulkanDevice* vkdev, size_t preferred_block_size = 16 * 1024 * 1024); // 16M
    virtual ~VkBlobAllocator();
public:
    // release all budgets immediately
    virtual void clear();
    virtual VkBufferMemory* fastMalloc(size_t size);
    virtual void fastFree(VkBufferMemory* ptr);
    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack);
    virtual void fastFree(VkImageMemory* ptr);
private:
    // non-copyable: declared but not defined
    VkBlobAllocator(const VkBlobAllocator&);
    VkBlobAllocator& operator=(const VkBlobAllocator&);
private:
    // pimpl: hides implementation state from the public header
    VkBlobAllocatorPrivate* const d;
};
class VkWeightAllocatorPrivate;
// Vulkan allocator for model weight storage, sub-allocating from preferred-size blocks.
class NCNN_EXPORT VkWeightAllocator : public VkAllocator
{
public:
    explicit VkWeightAllocator(const VulkanDevice* vkdev, size_t preferred_block_size = 8 * 1024 * 1024); // 8M
    virtual ~VkWeightAllocator();
public:
    // release all blocks immediately
    virtual void clear();
public:
    virtual VkBufferMemory* fastMalloc(size_t size);
    virtual void fastFree(VkBufferMemory* ptr);
    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack);
    virtual void fastFree(VkImageMemory* ptr);
private:
    // non-copyable: declared but not defined
    VkWeightAllocator(const VkWeightAllocator&);
    VkWeightAllocator& operator=(const VkWeightAllocator&);
private:
    // pimpl: hides implementation state from the public header
    VkWeightAllocatorPrivate* const d;
};
class VkStagingAllocatorPrivate;
// Vulkan allocator for host-visible staging buffers used in upload/download,
// with pool-style reuse of freed buffers.
class NCNN_EXPORT VkStagingAllocator : public VkAllocator
{
public:
    explicit VkStagingAllocator(const VulkanDevice* vkdev);
    virtual ~VkStagingAllocator();
public:
    // ratio range 0 ~ 1
    // default cr = 0.75
    void set_size_compare_ratio(float scr);
    // release all budgets immediately
    virtual void clear();
    virtual VkBufferMemory* fastMalloc(size_t size);
    virtual void fastFree(VkBufferMemory* ptr);
    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack);
    virtual void fastFree(VkImageMemory* ptr);
private:
    // non-copyable: declared but not defined
    VkStagingAllocator(const VkStagingAllocator&);
    VkStagingAllocator& operator=(const VkStagingAllocator&);
private:
    // pimpl: hides implementation state from the public header
    VkStagingAllocatorPrivate* const d;
};
class VkWeightStagingAllocatorPrivate;
// Vulkan staging allocator used while uploading model weights.
class NCNN_EXPORT VkWeightStagingAllocator : public VkAllocator
{
public:
    explicit VkWeightStagingAllocator(const VulkanDevice* vkdev);
    virtual ~VkWeightStagingAllocator();
public:
    virtual VkBufferMemory* fastMalloc(size_t size);
    virtual void fastFree(VkBufferMemory* ptr);
    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack);
    virtual void fastFree(VkImageMemory* ptr);
private:
    // non-copyable: declared but not defined
    VkWeightStagingAllocator(const VkWeightStagingAllocator&);
    VkWeightStagingAllocator& operator=(const VkWeightStagingAllocator&);
private:
    // pimpl: hides implementation state from the public header
    VkWeightStagingAllocatorPrivate* const d;
};
#if NCNN_PLATFORM_API
#if __ANDROID_API__ >= 26
// Allocator that imports an android AHardwareBuffer as a vulkan image,
// e.g. for zero-copy camera input (requires API level >= 26).
class NCNN_EXPORT VkAndroidHardwareBufferImageAllocator : public VkAllocator
{
public:
    VkAndroidHardwareBufferImageAllocator(const VulkanDevice* _vkdev, AHardwareBuffer* _hb);
    virtual ~VkAndroidHardwareBufferImageAllocator();
public:
    virtual VkBufferMemory* fastMalloc(size_t size);
    virtual void fastFree(VkBufferMemory* ptr);
    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack);
    virtual void fastFree(VkImageMemory* ptr);
private:
    // non-copyable: declared but not defined
    VkAndroidHardwareBufferImageAllocator(const VkAndroidHardwareBufferImageAllocator&);
    VkAndroidHardwareBufferImageAllocator& operator=(const VkAndroidHardwareBufferImageAllocator&);
public:
    // query the hardware buffer's properties; returns 0 on success
    // NOTE(review): exact return convention lives in the .cpp - confirm there
    int init();
    int width() const;
    int height() const;
    uint64_t external_format() const;
public:
    AHardwareBuffer* hb;
    AHardwareBuffer_Desc bufferDesc;
    VkAndroidHardwareBufferFormatPropertiesANDROID bufferFormatProperties;
    VkAndroidHardwareBufferPropertiesANDROID bufferProperties;
    VkSamplerYcbcrConversionKHR samplerYcbcrConversion;
};
#endif // __ANDROID_API__ >= 26
#endif // NCNN_PLATFORM_API
#endif // NCNN_VULKAN
} // namespace ncnn
#endif // NCNN_ALLOCATOR_H

36
lib_ncnn/ncnn/benchmark.h Executable file
View File

@@ -0,0 +1,36 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
#ifndef NCNN_BENCHMARK_H
#define NCNN_BENCHMARK_H
#include "layer.h"
#include "mat.h"
#include "platform.h"
namespace ncnn {
// get now timestamp in ms
NCNN_EXPORT double get_current_time();
#if NCNN_BENCHMARK
NCNN_EXPORT void benchmark(const Layer* layer, double start, double end);
NCNN_EXPORT void benchmark(const Layer* layer, const Mat& bottom_blob, Mat& top_blob, double start, double end);
#endif // NCNN_BENCHMARK
} // namespace ncnn
#endif // NCNN_BENCHMARK_H

44
lib_ncnn/ncnn/blob.h Executable file
View File

@@ -0,0 +1,44 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
#ifndef NCNN_BLOB_H
#define NCNN_BLOB_H
#include "mat.h"
#include "platform.h"
namespace ncnn {
// A named edge in the network graph connecting a producing layer to a
// consuming layer, with an optional shape hint.
class NCNN_EXPORT Blob
{
public:
    // empty
    Blob();
public:
#if NCNN_STRING
    // blob name
    std::string name;
#endif // NCNN_STRING
    // layer index which produce this blob as output
    int producer;
    // layer index which need this blob as input
    int consumer;
    // shape hint
    Mat shape;
};
} // namespace ncnn
#endif // NCNN_BLOB_H

320
lib_ncnn/ncnn/c_api.h Executable file
View File

@@ -0,0 +1,320 @@
/* Tencent is pleased to support the open source community by making ncnn available.
*
* Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
*
* Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
* in compliance with the License. You may obtain a copy of the License at
*
* https://opensource.org/licenses/BSD-3-Clause
*
* Unless required by applicable law or agreed to in writing, software distributed
* under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
* CONDITIONS OF ANY KIND, either express or implied. See the License for the
* specific language governing permissions and limitations under the License.
*/
#ifndef NCNN_C_API_H
#define NCNN_C_API_H
#include "platform.h"
#if NCNN_C_API
#include <stddef.h>
#ifdef __cplusplus
extern "C" {
#endif
NCNN_EXPORT const char* ncnn_version();
/* allocator api */
typedef struct __ncnn_allocator_t* ncnn_allocator_t;
/* C wrapper for ncnn::Allocator: pthis holds the underlying C++ object,
 * and the function pointers dispatch to its virtual methods. */
struct NCNN_EXPORT __ncnn_allocator_t
{
    void* pthis;
    /* allocate at least `size` bytes; returns 0 on failure */
    void* (*fast_malloc)(ncnn_allocator_t allocator, size_t size);
    /* release a buffer previously returned by fast_malloc */
    void (*fast_free)(ncnn_allocator_t allocator, void* ptr);
};
NCNN_EXPORT ncnn_allocator_t ncnn_allocator_create_pool_allocator();
NCNN_EXPORT ncnn_allocator_t ncnn_allocator_create_unlocked_pool_allocator();
NCNN_EXPORT void ncnn_allocator_destroy(ncnn_allocator_t allocator);
/* option api */
typedef struct __ncnn_option_t* ncnn_option_t;
NCNN_EXPORT ncnn_option_t ncnn_option_create();
NCNN_EXPORT void ncnn_option_destroy(ncnn_option_t opt);
NCNN_EXPORT int ncnn_option_get_num_threads(const ncnn_option_t opt);
NCNN_EXPORT void ncnn_option_set_num_threads(ncnn_option_t opt, int num_threads);
NCNN_EXPORT int ncnn_option_get_use_vulkan_compute(const ncnn_option_t opt);
NCNN_EXPORT void ncnn_option_set_use_vulkan_compute(ncnn_option_t opt, int use_vulkan_compute);
/* mat api */
typedef struct __ncnn_mat_t* ncnn_mat_t;
NCNN_EXPORT ncnn_mat_t ncnn_mat_create();
NCNN_EXPORT ncnn_mat_t ncnn_mat_create_1d(int w, ncnn_allocator_t allocator);
NCNN_EXPORT ncnn_mat_t ncnn_mat_create_2d(int w, int h, ncnn_allocator_t allocator);
NCNN_EXPORT ncnn_mat_t ncnn_mat_create_3d(int w, int h, int c, ncnn_allocator_t allocator);
NCNN_EXPORT ncnn_mat_t ncnn_mat_create_4d(int w, int h, int d, int c, ncnn_allocator_t allocator);
NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_1d(int w, void* data, ncnn_allocator_t allocator);
NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_2d(int w, int h, void* data, ncnn_allocator_t allocator);
NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_3d(int w, int h, int c, void* data, ncnn_allocator_t allocator);
NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_4d(int w, int h, int d, int c, void* data, ncnn_allocator_t allocator);
NCNN_EXPORT ncnn_mat_t ncnn_mat_create_1d_elem(int w, size_t elemsize, int elempack, ncnn_allocator_t allocator);
NCNN_EXPORT ncnn_mat_t ncnn_mat_create_2d_elem(int w, int h, size_t elemsize, int elempack, ncnn_allocator_t allocator);
NCNN_EXPORT ncnn_mat_t ncnn_mat_create_3d_elem(int w, int h, int c, size_t elemsize, int elempack, ncnn_allocator_t allocator);
NCNN_EXPORT ncnn_mat_t ncnn_mat_create_4d_elem(int w, int h, int d, int c, size_t elemsize, int elempack, ncnn_allocator_t allocator);
NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_1d_elem(int w, void* data, size_t elemsize, int elempack, ncnn_allocator_t allocator);
NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_2d_elem(int w, int h, void* data, size_t elemsize, int elempack, ncnn_allocator_t allocator);
NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_3d_elem(int w, int h, int c, void* data, size_t elemsize, int elempack, ncnn_allocator_t allocator);
NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_4d_elem(int w, int h, int d, int c, void* data, size_t elemsize, int elempack, ncnn_allocator_t allocator);
NCNN_EXPORT void ncnn_mat_destroy(ncnn_mat_t mat);
NCNN_EXPORT void ncnn_mat_fill_float(ncnn_mat_t mat, float v);
NCNN_EXPORT ncnn_mat_t ncnn_mat_clone(const ncnn_mat_t mat, ncnn_allocator_t allocator);
NCNN_EXPORT ncnn_mat_t ncnn_mat_reshape_1d(const ncnn_mat_t mat, int w, ncnn_allocator_t allocator);
NCNN_EXPORT ncnn_mat_t ncnn_mat_reshape_2d(const ncnn_mat_t mat, int w, int h, ncnn_allocator_t allocator);
NCNN_EXPORT ncnn_mat_t ncnn_mat_reshape_3d(const ncnn_mat_t mat, int w, int h, int c, ncnn_allocator_t allocator);
NCNN_EXPORT ncnn_mat_t ncnn_mat_reshape_4d(const ncnn_mat_t mat, int w, int h, int d, int c, ncnn_allocator_t allocator);
NCNN_EXPORT int ncnn_mat_get_dims(const ncnn_mat_t mat);
NCNN_EXPORT int ncnn_mat_get_w(const ncnn_mat_t mat);
NCNN_EXPORT int ncnn_mat_get_h(const ncnn_mat_t mat);
NCNN_EXPORT int ncnn_mat_get_d(const ncnn_mat_t mat);
NCNN_EXPORT int ncnn_mat_get_c(const ncnn_mat_t mat);
NCNN_EXPORT size_t ncnn_mat_get_elemsize(const ncnn_mat_t mat);
NCNN_EXPORT int ncnn_mat_get_elempack(const ncnn_mat_t mat);
NCNN_EXPORT size_t ncnn_mat_get_cstep(const ncnn_mat_t mat);
NCNN_EXPORT void* ncnn_mat_get_data(const ncnn_mat_t mat);
NCNN_EXPORT void* ncnn_mat_get_channel_data(const ncnn_mat_t mat, int c);
#if NCNN_PIXEL
/* mat pixel api */
#define NCNN_MAT_PIXEL_RGB 1
#define NCNN_MAT_PIXEL_BGR 2
#define NCNN_MAT_PIXEL_GRAY 3
#define NCNN_MAT_PIXEL_RGBA 4
#define NCNN_MAT_PIXEL_BGRA 5
/* Pack a source->destination pixel conversion pair into one int:
 * source type in the low 16 bits, destination type in the high 16 bits.
 * Arguments are fully parenthesized so that expression arguments
 * (e.g. ternaries) expand with the intended precedence. */
#define NCNN_MAT_PIXEL_X2Y(X, Y) ((X) | ((Y) << 16))
NCNN_EXPORT ncnn_mat_t ncnn_mat_from_pixels(const unsigned char* pixels, int type, int w, int h, int stride, ncnn_allocator_t allocator);
NCNN_EXPORT ncnn_mat_t ncnn_mat_from_pixels_resize(const unsigned char* pixels, int type, int w, int h, int stride, int target_width, int target_height, ncnn_allocator_t allocator);
NCNN_EXPORT ncnn_mat_t ncnn_mat_from_pixels_roi(const unsigned char* pixels, int type, int w, int h, int stride, int roix, int roiy, int roiw, int roih, ncnn_allocator_t allocator);
NCNN_EXPORT ncnn_mat_t ncnn_mat_from_pixels_roi_resize(const unsigned char* pixels, int type, int w, int h, int stride, int roix, int roiy, int roiw, int roih, int target_width, int target_height, ncnn_allocator_t allocator);
NCNN_EXPORT void ncnn_mat_to_pixels(const ncnn_mat_t mat, unsigned char* pixels, int type, int stride);
NCNN_EXPORT void ncnn_mat_to_pixels_resize(const ncnn_mat_t mat, unsigned char* pixels, int type, int target_width, int target_height, int target_stride);
#endif /* NCNN_PIXEL */
NCNN_EXPORT void ncnn_mat_substract_mean_normalize(ncnn_mat_t mat, const float* mean_vals, const float* norm_vals);
NCNN_EXPORT void ncnn_convert_packing(const ncnn_mat_t src, ncnn_mat_t* dst, int elempack, const ncnn_option_t opt);
NCNN_EXPORT void ncnn_flatten(const ncnn_mat_t src, ncnn_mat_t* dst, const ncnn_option_t opt);
/* blob api */
typedef struct __ncnn_blob_t* ncnn_blob_t;
#if NCNN_STRING
NCNN_EXPORT const char* ncnn_blob_get_name(const ncnn_blob_t blob);
#endif /* NCNN_STRING */
NCNN_EXPORT int ncnn_blob_get_producer(const ncnn_blob_t blob);
NCNN_EXPORT int ncnn_blob_get_consumer(const ncnn_blob_t blob);
NCNN_EXPORT void ncnn_blob_get_shape(const ncnn_blob_t blob, int* dims, int* w, int* h, int* c);
/* paramdict api */
typedef struct __ncnn_paramdict_t* ncnn_paramdict_t;
NCNN_EXPORT ncnn_paramdict_t ncnn_paramdict_create();
NCNN_EXPORT void ncnn_paramdict_destroy(ncnn_paramdict_t pd);
NCNN_EXPORT int ncnn_paramdict_get_type(const ncnn_paramdict_t pd, int id);
NCNN_EXPORT int ncnn_paramdict_get_int(const ncnn_paramdict_t pd, int id, int def);
NCNN_EXPORT float ncnn_paramdict_get_float(const ncnn_paramdict_t pd, int id, float def);
NCNN_EXPORT ncnn_mat_t ncnn_paramdict_get_array(const ncnn_paramdict_t pd, int id, const ncnn_mat_t def);
NCNN_EXPORT void ncnn_paramdict_set_int(ncnn_paramdict_t pd, int id, int i);
NCNN_EXPORT void ncnn_paramdict_set_float(ncnn_paramdict_t pd, int id, float f);
NCNN_EXPORT void ncnn_paramdict_set_array(ncnn_paramdict_t pd, int id, const ncnn_mat_t v);
/* datareader api */
typedef struct __ncnn_datareader_t* ncnn_datareader_t;
/* C wrapper for ncnn::DataReader: pthis holds the underlying C++ object,
 * and the function pointers dispatch to its virtual methods. */
struct NCNN_EXPORT __ncnn_datareader_t
{
    void* pthis;
#if NCNN_STRING
    /* scanf-style parse of the next token(s) from the stream */
    int (*scan)(ncnn_datareader_t dr, const char* format, void* p);
#endif /* NCNN_STRING */
    /* read `size` raw bytes into buf; returns the number of bytes read */
    size_t (*read)(ncnn_datareader_t dr, void* buf, size_t size);
};
NCNN_EXPORT ncnn_datareader_t ncnn_datareader_create();
#if NCNN_STDIO
NCNN_EXPORT ncnn_datareader_t ncnn_datareader_create_from_stdio(FILE* fp);
#endif /* NCNN_STDIO */
NCNN_EXPORT ncnn_datareader_t ncnn_datareader_create_from_memory(const unsigned char** mem);
NCNN_EXPORT void ncnn_datareader_destroy(ncnn_datareader_t dr);
/* modelbin api */
typedef struct __ncnn_modelbin_t* ncnn_modelbin_t;
/* C wrapper for ncnn::ModelBin: loads weight mats of 1, 2 or 3 dimensions.
 * `type` selects the storage conversion; see the ModelBin C++ docs. */
struct NCNN_EXPORT __ncnn_modelbin_t
{
    void* pthis;
    ncnn_mat_t (*load_1d)(const ncnn_modelbin_t mb, int w, int type);
    ncnn_mat_t (*load_2d)(const ncnn_modelbin_t mb, int w, int h, int type);
    ncnn_mat_t (*load_3d)(const ncnn_modelbin_t mb, int w, int h, int c, int type);
};
NCNN_EXPORT ncnn_modelbin_t ncnn_modelbin_create_from_datareader(const ncnn_datareader_t dr);
NCNN_EXPORT ncnn_modelbin_t ncnn_modelbin_create_from_mat_array(const ncnn_mat_t* weights, int n);
NCNN_EXPORT void ncnn_modelbin_destroy(ncnn_modelbin_t mb);
/* layer api */
typedef struct __ncnn_layer_t* ncnn_layer_t;
/* C wrapper for ncnn::Layer: pthis holds the underlying C++ object, and the
 * function pointers mirror the Layer virtual interface (param/model loading,
 * pipeline lifecycle, and the four forward variants). */
struct NCNN_EXPORT __ncnn_layer_t
{
    void* pthis;
    int (*load_param)(ncnn_layer_t layer, const ncnn_paramdict_t pd);
    int (*load_model)(ncnn_layer_t layer, const ncnn_modelbin_t mb);
    int (*create_pipeline)(ncnn_layer_t layer, const ncnn_option_t opt);
    int (*destroy_pipeline)(ncnn_layer_t layer, const ncnn_option_t opt);
    /* single-input single-output forward */
    int (*forward_1)(const ncnn_layer_t layer, const ncnn_mat_t bottom_blob, ncnn_mat_t* top_blob, const ncnn_option_t opt);
    /* multi-input (n blobs) multi-output (n2 blobs) forward */
    int (*forward_n)(const ncnn_layer_t layer, const ncnn_mat_t* bottom_blobs, int n, ncnn_mat_t* top_blobs, int n2, const ncnn_option_t opt);
    /* in-place forward variants: bottom blob(s) are overwritten with the result */
    int (*forward_inplace_1)(const ncnn_layer_t layer, ncnn_mat_t bottom_top_blob, const ncnn_option_t opt);
    int (*forward_inplace_n)(const ncnn_layer_t layer, ncnn_mat_t* bottom_top_blobs, int n, const ncnn_option_t opt);
};
NCNN_EXPORT ncnn_layer_t ncnn_layer_create();
NCNN_EXPORT ncnn_layer_t ncnn_layer_create_by_typeindex(int typeindex);
#if NCNN_STRING
NCNN_EXPORT ncnn_layer_t ncnn_layer_create_by_type(const char* type);
#endif /* NCNN_STRING */
NCNN_EXPORT void ncnn_layer_destroy(ncnn_layer_t layer);
#if NCNN_STRING
NCNN_EXPORT const char* ncnn_layer_get_name(const ncnn_layer_t layer);
#endif /* NCNN_STRING */
NCNN_EXPORT int ncnn_layer_get_typeindex(const ncnn_layer_t layer);
#if NCNN_STRING
NCNN_EXPORT const char* ncnn_layer_get_type(const ncnn_layer_t layer);
#endif /* NCNN_STRING */
NCNN_EXPORT int ncnn_layer_get_one_blob_only(const ncnn_layer_t layer);
NCNN_EXPORT int ncnn_layer_get_support_inplace(const ncnn_layer_t layer);
NCNN_EXPORT int ncnn_layer_get_support_vulkan(const ncnn_layer_t layer);
NCNN_EXPORT int ncnn_layer_get_support_packing(const ncnn_layer_t layer);
NCNN_EXPORT int ncnn_layer_get_support_bf16_storage(const ncnn_layer_t layer);
NCNN_EXPORT int ncnn_layer_get_support_fp16_storage(const ncnn_layer_t layer);
NCNN_EXPORT int ncnn_layer_get_support_image_storage(const ncnn_layer_t layer);
NCNN_EXPORT void ncnn_layer_set_one_blob_only(ncnn_layer_t layer, int enable);
NCNN_EXPORT void ncnn_layer_set_support_inplace(ncnn_layer_t layer, int enable);
NCNN_EXPORT void ncnn_layer_set_support_vulkan(ncnn_layer_t layer, int enable);
NCNN_EXPORT void ncnn_layer_set_support_packing(ncnn_layer_t layer, int enable);
NCNN_EXPORT void ncnn_layer_set_support_bf16_storage(ncnn_layer_t layer, int enable);
NCNN_EXPORT void ncnn_layer_set_support_fp16_storage(ncnn_layer_t layer, int enable);
NCNN_EXPORT void ncnn_layer_set_support_image_storage(ncnn_layer_t layer, int enable);
NCNN_EXPORT int ncnn_layer_get_bottom_count(const ncnn_layer_t layer);
NCNN_EXPORT int ncnn_layer_get_bottom(const ncnn_layer_t layer, int i);
NCNN_EXPORT int ncnn_layer_get_top_count(const ncnn_layer_t layer);
NCNN_EXPORT int ncnn_layer_get_top(const ncnn_layer_t layer, int i);
NCNN_EXPORT void ncnn_blob_get_bottom_shape(const ncnn_layer_t layer, int i, int* dims, int* w, int* h, int* c);
NCNN_EXPORT void ncnn_blob_get_top_shape(const ncnn_layer_t layer, int i, int* dims, int* w, int* h, int* c);
/* layer factory function */
typedef ncnn_layer_t (*ncnn_layer_creator_t)(void* userdata);
typedef void (*ncnn_layer_destroyer_t)(ncnn_layer_t layer, void* userdata);
typedef struct __ncnn_net_custom_layer_factory_t* ncnn_net_custom_layer_factory_t;
/* Singly-linked list node holding one registered custom layer factory:
 * creator/destroyer callbacks plus the opaque userdata passed back to them. */
struct __ncnn_net_custom_layer_factory_t
{
    ncnn_layer_creator_t creator;
    ncnn_layer_destroyer_t destroyer;
    void* userdata;
    /* next registered factory, or 0 at the end of the list */
    ncnn_net_custom_layer_factory_t next;
};
/* net api */
typedef struct __ncnn_net_t* ncnn_net_t;
/* C wrapper for ncnn::Net: pthis holds the underlying C++ object, plus the
 * head of the custom layer factory list registered through the C api. */
struct __ncnn_net_t
{
    void* pthis;
    ncnn_net_custom_layer_factory_t custom_layer_factory;
};
NCNN_EXPORT ncnn_net_t ncnn_net_create();
NCNN_EXPORT void ncnn_net_destroy(ncnn_net_t net);
NCNN_EXPORT void ncnn_net_set_option(ncnn_net_t net, ncnn_option_t opt);
#if NCNN_STRING
NCNN_EXPORT void ncnn_net_register_custom_layer_by_type(ncnn_net_t net, const char* type, ncnn_layer_creator_t creator, ncnn_layer_destroyer_t destroyer, void* userdata);
#endif /* NCNN_STRING */
NCNN_EXPORT void ncnn_net_register_custom_layer_by_typeindex(ncnn_net_t net, int typeindex, ncnn_layer_creator_t creator, ncnn_layer_destroyer_t destroyer, void* userdata);
#if NCNN_STDIO
#if NCNN_STRING
NCNN_EXPORT int ncnn_net_load_param(ncnn_net_t net, const char* path);
#endif /* NCNN_STRING */
NCNN_EXPORT int ncnn_net_load_param_bin(ncnn_net_t net, const char* path);
NCNN_EXPORT int ncnn_net_load_model(ncnn_net_t net, const char* path);
#endif /* NCNN_STDIO */
#if NCNN_STDIO
#if NCNN_STRING
NCNN_EXPORT int ncnn_net_load_param_memory(ncnn_net_t net, const char* mem);
#endif /* NCNN_STRING */
#endif /* NCNN_STDIO */
NCNN_EXPORT int ncnn_net_load_param_bin_memory(ncnn_net_t net, const unsigned char* mem);
NCNN_EXPORT int ncnn_net_load_model_memory(ncnn_net_t net, const unsigned char* mem);
#if NCNN_STRING
NCNN_EXPORT int ncnn_net_load_param_datareader(ncnn_net_t net, const ncnn_datareader_t dr);
#endif /* NCNN_STRING */
NCNN_EXPORT int ncnn_net_load_param_bin_datareader(ncnn_net_t net, const ncnn_datareader_t dr);
NCNN_EXPORT int ncnn_net_load_model_datareader(ncnn_net_t net, const ncnn_datareader_t dr);
NCNN_EXPORT void ncnn_net_clear(ncnn_net_t net);
/* extractor api */
typedef struct __ncnn_extractor_t* ncnn_extractor_t;
NCNN_EXPORT ncnn_extractor_t ncnn_extractor_create(ncnn_net_t net);
NCNN_EXPORT void ncnn_extractor_destroy(ncnn_extractor_t ex);
NCNN_EXPORT void ncnn_extractor_set_option(ncnn_extractor_t ex, const ncnn_option_t opt);
#if NCNN_STRING
NCNN_EXPORT int ncnn_extractor_input(ncnn_extractor_t ex, const char* name, const ncnn_mat_t mat);
NCNN_EXPORT int ncnn_extractor_extract(ncnn_extractor_t ex, const char* name, ncnn_mat_t* mat);
#endif /* NCNN_STRING */
NCNN_EXPORT int ncnn_extractor_input_index(ncnn_extractor_t ex, int index, const ncnn_mat_t mat);
NCNN_EXPORT int ncnn_extractor_extract_index(ncnn_extractor_t ex, int index, ncnn_mat_t* mat);
#ifdef __cplusplus
} /* extern "C" */
#endif
#endif /* NCNN_C_API */
#endif /* NCNN_C_API_H */

136
lib_ncnn/ncnn/command.h Executable file
View File

@@ -0,0 +1,136 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
#ifndef NCNN_COMMAND_H
#define NCNN_COMMAND_H
#include "platform.h"
#if NCNN_VULKAN
#include "mat.h"
#include <vulkan/vulkan.h>
namespace ncnn {
class Pipeline;
#if NCNN_PLATFORM_API
#if __ANDROID_API__ >= 26
class ImportAndroidHardwareBufferPipeline;
#endif // __ANDROID_API__ >= 26
#endif // NCNN_PLATFORM_API
class VkComputePrivate;
// Records compute work (uploads, downloads, clones, pipeline dispatches)
// into a vulkan command buffer; submit_and_wait() executes the recorded work.
class NCNN_EXPORT VkCompute
{
public:
    explicit VkCompute(const VulkanDevice* vkdev);
    virtual ~VkCompute();
public:
    // host -> device transfers
    void record_upload(const Mat& src, VkMat& dst, const Option& opt);
    void record_upload(const Mat& src, VkImageMat& dst, const Option& opt);
    // device -> host transfers
    void record_download(const VkMat& src, Mat& dst, const Option& opt);
    void record_download(const VkImageMat& src, Mat& dst, const Option& opt);
    // buffer <-> image layout conversion on the device
    void record_buffer_to_image(const VkMat& src, VkImageMat& dst, const Option& opt);
    void record_image_to_buffer(const VkImageMat& src, VkMat& dst, const Option& opt);
    // copies between any combination of host/device buffer/image storage
    void record_clone(const Mat& src, VkMat& dst, const Option& opt);
    void record_clone(const Mat& src, VkImageMat& dst, const Option& opt);
    void record_clone(const VkMat& src, Mat& dst, const Option& opt);
    void record_clone(const VkImageMat& src, Mat& dst, const Option& opt);
    void record_clone(const VkMat& src, VkMat& dst, const Option& opt);
    void record_clone(const VkImageMat& src, VkImageMat& dst, const Option& opt);
    void record_clone(const VkMat& src, VkImageMat& dst, const Option& opt);
    void record_clone(const VkImageMat& src, VkMat& dst, const Option& opt);
    // dispatch a compute pipeline; `dispatcher` supplies the global work size
    void record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& bindings, const std::vector<vk_constant_type>& constants, const VkMat& dispatcher);
    void record_pipeline(const Pipeline* pipeline, const std::vector<VkImageMat>& bindings, const std::vector<vk_constant_type>& constants, const VkImageMat& dispatcher);
    void record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& buffer_bindings, const std::vector<VkImageMat>& image_bindings, const std::vector<vk_constant_type>& constants, const VkMat& dispatcher);
    void record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& buffer_bindings, const std::vector<VkImageMat>& image_bindings, const std::vector<vk_constant_type>& constants, const VkImageMat& dispatcher);
    void record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& buffer_bindings, const std::vector<VkImageMat>& image_bindings, const std::vector<vk_constant_type>& constants, const Mat& dispatcher);
#if NCNN_BENCHMARK
    void record_write_timestamp(uint32_t query);
#endif // NCNN_BENCHMARK
#if NCNN_PLATFORM_API
#if __ANDROID_API__ >= 26
    void record_import_android_hardware_buffer(const ImportAndroidHardwareBufferPipeline* pipeline, const VkImageMat& src, const VkMat& dst);
    void record_import_android_hardware_buffer(const ImportAndroidHardwareBufferPipeline* pipeline, const VkImageMat& src, const VkImageMat& dst);
#endif // __ANDROID_API__ >= 26
#endif // NCNN_PLATFORM_API
    // submit the recorded command buffer and block until execution completes
    int submit_and_wait();
    // reset recording state so the object can be reused
    int reset();
#if NCNN_BENCHMARK
    int create_query_pool(uint32_t query_count);
    int get_query_pool_results(uint32_t first_query, uint32_t query_count, std::vector<uint64_t>& results);
#endif // NCNN_BENCHMARK
protected:
    const VulkanDevice* vkdev;
    // insert memory barriers for the given binding's access pattern
    void barrier_readwrite(const VkMat& binding);
    void barrier_readwrite(const VkImageMat& binding);
    void barrier_readonly(const VkImageMat& binding);
private:
    // pimpl: hides command buffer state from the public header
    VkComputePrivate* const d;
};
class VkTransferPrivate;

// Upload-only command recorder; used e.g. by Layer::upload_model to move
// weight blobs from host to device, then executed with submit_and_wait().
class NCNN_EXPORT VkTransfer
{
public:
    // vkdev must outlive this recorder
    explicit VkTransfer(const VulkanDevice* vkdev);
    virtual ~VkTransfer();

public:
    // record host-to-device transfer
    // flatten: presumably collapses dims into 1-D before upload — TODO confirm in command.cpp
    void record_upload(const Mat& src, VkMat& dst, const Option& opt, bool flatten = true);
    void record_upload(const Mat& src, VkImageMat& dst, const Option& opt);

    // execute recorded transfers and block until they complete
    // return 0 if success
    int submit_and_wait();

protected:
    const VulkanDevice* vkdev; // not owned

private:
    VkTransferPrivate* const d; // private implementation, owned
};
} // namespace ncnn
#endif // NCNN_VULKAN
#endif // NCNN_COMMAND_H

131
lib_ncnn/ncnn/cpu.h Executable file
View File

@@ -0,0 +1,131 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
#ifndef NCNN_CPU_H
#define NCNN_CPU_H
#include <stddef.h>
#if defined __ANDROID__ || defined __linux__
#include <sched.h> // cpu_set_t
#endif
#include "platform.h"
namespace ncnn {
// Set of CPU core indices, used to express thread affinity masks.
class NCNN_EXPORT CpuSet
{
public:
    CpuSet();
    void enable(int cpu);
    void disable(int cpu);
    void disable_all();
    bool is_enabled(int cpu) const;
    // count of cpus currently enabled in the set
    int num_enabled() const;

public:
#if defined __ANDROID__ || defined __linux__
    cpu_set_t cpu_set; // native sched affinity mask
#endif
#if __APPLE__
    unsigned int policy; // affinity policy value — exact semantics live in cpu.cpp
#endif
};
// test optional cpu features
// each returns nonzero when the feature is available on the running cpu
// neon = armv7 neon or aarch64 asimd
NCNN_EXPORT int cpu_support_arm_neon();
// vfpv4 = armv7 fp16 + fma
NCNN_EXPORT int cpu_support_arm_vfpv4();
// asimdhp = aarch64 asimd half precision
NCNN_EXPORT int cpu_support_arm_asimdhp();
// asimddp = aarch64 asimd dot product
NCNN_EXPORT int cpu_support_arm_asimddp();
// avx = x86 avx
NCNN_EXPORT int cpu_support_x86_avx();
// fma = x86 fma
NCNN_EXPORT int cpu_support_x86_fma();
// xop = x86 xop
NCNN_EXPORT int cpu_support_x86_xop();
// f16c = x86 f16c
NCNN_EXPORT int cpu_support_x86_f16c();
// avx2 = x86 avx2 + fma + f16c
NCNN_EXPORT int cpu_support_x86_avx2();
// avx_vnni = x86 avx vnni
NCNN_EXPORT int cpu_support_x86_avx_vnni();
// avx512 = x86 avx512f + avx512cd + avx512bw + avx512dq + avx512vl
NCNN_EXPORT int cpu_support_x86_avx512();
// avx512_vnni = x86 avx512 vnni
NCNN_EXPORT int cpu_support_x86_avx512_vnni();
// msa = mips msa
NCNN_EXPORT int cpu_support_mips_msa();
// mmi = loongson mmi
NCNN_EXPORT int cpu_support_loongson_mmi();
// v = riscv vector
NCNN_EXPORT int cpu_support_riscv_v();
// zfh = riscv half-precision float
NCNN_EXPORT int cpu_support_riscv_zfh();
// vlenb = riscv vector length in bytes
NCNN_EXPORT int cpu_riscv_vlenb();

// cpu info
NCNN_EXPORT int get_cpu_count();
NCNN_EXPORT int get_little_cpu_count();
NCNN_EXPORT int get_big_cpu_count();

// bind all threads on little clusters if powersave enabled
// affects HMP arch cpu like ARM big.LITTLE
// only implemented on android at the moment
// switching powersave is expensive and not thread-safe
// 0 = all cores enabled(default)
// 1 = only little clusters enabled
// 2 = only big clusters enabled
// return 0 if success for setter function
NCNN_EXPORT int get_cpu_powersave();
NCNN_EXPORT int set_cpu_powersave(int powersave);

// convenient wrapper
NCNN_EXPORT const CpuSet& get_cpu_thread_affinity_mask(int powersave);

// set explicit thread affinity
NCNN_EXPORT int set_cpu_thread_affinity(const CpuSet& thread_affinity_mask);

// misc function wrapper for openmp routines
NCNN_EXPORT int get_omp_num_threads();
NCNN_EXPORT void set_omp_num_threads(int num_threads);
NCNN_EXPORT int get_omp_dynamic();
NCNN_EXPORT void set_omp_dynamic(int dynamic);
NCNN_EXPORT int get_omp_thread_num();
NCNN_EXPORT int get_kmp_blocktime();
NCNN_EXPORT void set_kmp_blocktime(int time_ms);

// need to flush denormals on Intel Chipset.
// Other architectures such as ARM can be added as needed.
// 0 = DAZ OFF, FTZ OFF
// 1 = DAZ ON , FTZ OFF
// 2 = DAZ OFF, FTZ ON
// 3 = DAZ ON, FTZ ON
NCNN_EXPORT int get_flush_denormals();
NCNN_EXPORT int set_flush_denormals(int flush_denormals);
} // namespace ncnn
#endif // NCNN_CPU_H

122
lib_ncnn/ncnn/datareader.h Executable file
View File

@@ -0,0 +1,122 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
#ifndef NCNN_DATAREADER_H
#define NCNN_DATAREADER_H
#include "platform.h"
#if NCNN_STDIO
#include <stdio.h>
#endif
#if NCNN_PLATFORM_API
#if __ANDROID_API__ >= 9
#include <android/asset_manager.h>
#endif
#endif // NCNN_PLATFORM_API
namespace ncnn {
// data read wrapper
// Abstract source of param/model bytes. Subclasses below adapt stdio FILEs,
// in-memory buffers and Android assets to this single interface.
// Default implementations exist (methods are non-pure); behavior of the
// defaults is defined in datareader.cpp.
class NCNN_EXPORT DataReader
{
public:
    DataReader();
    virtual ~DataReader();

#if NCNN_STRING
    // parse plain param text
    // return 1 if scan success
    virtual int scan(const char* format, void* p) const;
#endif // NCNN_STRING

    // read binary param and model data
    // return bytes read
    virtual size_t read(void* buf, size_t size) const;

    // get model data reference
    // return bytes referenced (zero-copy access when the source supports it)
    virtual size_t reference(size_t size, const void** buf) const;
};
#if NCNN_STDIO
class DataReaderFromStdioPrivate;

// DataReader over a stdio FILE*; the FILE is borrowed, not owned —
// the caller keeps it open for the reader's lifetime and closes it after.
class NCNN_EXPORT DataReaderFromStdio : public DataReader
{
public:
    explicit DataReaderFromStdio(FILE* fp);
    virtual ~DataReaderFromStdio();

#if NCNN_STRING
    virtual int scan(const char* format, void* p) const;
#endif // NCNN_STRING
    virtual size_t read(void* buf, size_t size) const;

private:
    // non-copyable (declared, not defined)
    DataReaderFromStdio(const DataReaderFromStdio&);
    DataReaderFromStdio& operator=(const DataReaderFromStdio&);

private:
    DataReaderFromStdioPrivate* const d; // private implementation, owned
};
#endif // NCNN_STDIO
class DataReaderFromMemoryPrivate;

// DataReader over a caller-owned memory buffer.
// NOTE(review): the pointer is taken by reference (const unsigned char*&),
// which suggests the caller's pointer is advanced as data is consumed —
// confirm in datareader.cpp.
class NCNN_EXPORT DataReaderFromMemory : public DataReader
{
public:
    explicit DataReaderFromMemory(const unsigned char*& mem);
    virtual ~DataReaderFromMemory();

#if NCNN_STRING
    virtual int scan(const char* format, void* p) const;
#endif // NCNN_STRING
    virtual size_t read(void* buf, size_t size) const;
    // zero-copy reference is possible for a memory source, hence the override
    virtual size_t reference(size_t size, const void** buf) const;

private:
    // non-copyable (declared, not defined)
    DataReaderFromMemory(const DataReaderFromMemory&);
    DataReaderFromMemory& operator=(const DataReaderFromMemory&);

private:
    DataReaderFromMemoryPrivate* const d; // private implementation, owned
};
#if NCNN_PLATFORM_API
#if __ANDROID_API__ >= 9
class DataReaderFromAndroidAssetPrivate;

// DataReader over an Android AAsset; the asset handle is borrowed, not owned.
class NCNN_EXPORT DataReaderFromAndroidAsset : public DataReader
{
public:
    explicit DataReaderFromAndroidAsset(AAsset* asset);
    virtual ~DataReaderFromAndroidAsset();

#if NCNN_STRING
    virtual int scan(const char* format, void* p) const;
#endif // NCNN_STRING
    virtual size_t read(void* buf, size_t size) const;

private:
    // non-copyable (declared, not defined)
    DataReaderFromAndroidAsset(const DataReaderFromAndroidAsset&);
    DataReaderFromAndroidAsset& operator=(const DataReaderFromAndroidAsset&);

private:
    DataReaderFromAndroidAssetPrivate* const d; // private implementation, owned
};
#endif // __ANDROID_API__ >= 9
#endif // NCNN_PLATFORM_API
} // namespace ncnn
#endif // NCNN_DATAREADER_H

359
lib_ncnn/ncnn/gpu.h Executable file
View File

@@ -0,0 +1,359 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
#ifndef NCNN_GPU_H
#define NCNN_GPU_H
#include "platform.h"
#if NCNN_VULKAN
#include "mat.h"
#include <vulkan/vulkan.h>
#include "vulkan_header_fix.h"
namespace ncnn {
// instance
// global Vulkan instance lifecycle; create before any gpu query, destroy last
NCNN_EXPORT int create_gpu_instance();
NCNN_EXPORT void destroy_gpu_instance();

// instance extension capability
// nonzero when the corresponding instance extension was found at create time
extern int support_VK_KHR_external_memory_capabilities;
extern int support_VK_KHR_get_physical_device_properties2;
extern int support_VK_KHR_get_surface_capabilities2;
extern int support_VK_KHR_surface;
extern int support_VK_EXT_debug_utils;
#if __ANDROID_API__ >= 26
extern int support_VK_KHR_android_surface;
#endif // __ANDROID_API__ >= 26

// instance-level extension entry points, resolved at instance creation;
// only valid while the gpu instance exists
// VK_KHR_external_memory_capabilities
extern PFN_vkGetPhysicalDeviceExternalBufferPropertiesKHR vkGetPhysicalDeviceExternalBufferPropertiesKHR;
// VK_KHR_get_physical_device_properties2
extern PFN_vkGetPhysicalDeviceFeatures2KHR vkGetPhysicalDeviceFeatures2KHR;
extern PFN_vkGetPhysicalDeviceProperties2KHR vkGetPhysicalDeviceProperties2KHR;
extern PFN_vkGetPhysicalDeviceFormatProperties2KHR vkGetPhysicalDeviceFormatProperties2KHR;
extern PFN_vkGetPhysicalDeviceImageFormatProperties2KHR vkGetPhysicalDeviceImageFormatProperties2KHR;
extern PFN_vkGetPhysicalDeviceQueueFamilyProperties2KHR vkGetPhysicalDeviceQueueFamilyProperties2KHR;
extern PFN_vkGetPhysicalDeviceMemoryProperties2KHR vkGetPhysicalDeviceMemoryProperties2KHR;
extern PFN_vkGetPhysicalDeviceSparseImageFormatProperties2KHR vkGetPhysicalDeviceSparseImageFormatProperties2KHR;
// VK_KHR_get_surface_capabilities2
extern PFN_vkGetPhysicalDeviceSurfaceCapabilities2KHR vkGetPhysicalDeviceSurfaceCapabilities2KHR;
extern PFN_vkGetPhysicalDeviceSurfaceFormats2KHR vkGetPhysicalDeviceSurfaceFormats2KHR;
// VK_KHR_surface
extern PFN_vkDestroySurfaceKHR vkDestroySurfaceKHR;
extern PFN_vkGetPhysicalDeviceSurfaceSupportKHR vkGetPhysicalDeviceSurfaceSupportKHR;
extern PFN_vkGetPhysicalDeviceSurfaceCapabilitiesKHR vkGetPhysicalDeviceSurfaceCapabilitiesKHR;
extern PFN_vkGetPhysicalDeviceSurfaceFormatsKHR vkGetPhysicalDeviceSurfaceFormatsKHR;
extern PFN_vkGetPhysicalDeviceSurfacePresentModesKHR vkGetPhysicalDeviceSurfacePresentModesKHR;
#if __ANDROID_API__ >= 26
// VK_KHR_android_surface
extern PFN_vkCreateAndroidSurfaceKHR vkCreateAndroidSurfaceKHR;
#endif // __ANDROID_API__ >= 26
// VK_NV_cooperative_matrix
extern PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesNV vkGetPhysicalDeviceCooperativeMatrixPropertiesNV;

// get info
NCNN_EXPORT int get_gpu_count();
NCNN_EXPORT int get_default_gpu_index();
class GpuInfoPrivate;

// Read-only capability snapshot of one physical gpu, populated by
// create_gpu_instance() (a friend). Query limits, queue layout, feature
// and extension support before choosing a device.
class NCNN_EXPORT GpuInfo
{
public:
    explicit GpuInfo();
    virtual ~GpuInfo();

    // vulkan physical device
    VkPhysicalDevice physical_device() const;

    // memory properties
    const VkPhysicalDeviceMemoryProperties& physical_device_memory_properties() const;

    // info
    uint32_t api_version() const;
    uint32_t driver_version() const;
    uint32_t vendor_id() const;
    uint32_t device_id() const;
    const char* device_name() const;
    uint8_t* pipeline_cache_uuid() const;

    // 0 = discrete gpu
    // 1 = integrated gpu
    // 2 = virtual gpu
    // 3 = cpu
    int type() const;

    // hardware limit
    uint32_t max_shared_memory_size() const;
    uint32_t max_workgroup_count_x() const;
    uint32_t max_workgroup_count_y() const;
    uint32_t max_workgroup_count_z() const;
    uint32_t max_workgroup_invocations() const;
    uint32_t max_workgroup_size_x() const;
    uint32_t max_workgroup_size_y() const;
    uint32_t max_workgroup_size_z() const;
    size_t memory_map_alignment() const;
    size_t buffer_offset_alignment() const;
    size_t non_coherent_atom_size() const;
    size_t buffer_image_granularity() const;
    uint32_t max_image_dimension_1d() const;
    uint32_t max_image_dimension_2d() const;
    uint32_t max_image_dimension_3d() const;
    float timestamp_period() const;

    // runtime
    uint32_t compute_queue_family_index() const;
    uint32_t graphics_queue_family_index() const;
    uint32_t transfer_queue_family_index() const;
    uint32_t compute_queue_count() const;
    uint32_t graphics_queue_count() const;
    uint32_t transfer_queue_count() const;

    // property
    bool unified_compute_transfer_queue() const;

    // subgroup
    uint32_t subgroup_size() const;
    bool support_subgroup_basic() const;
    bool support_subgroup_vote() const;
    bool support_subgroup_ballot() const;
    bool support_subgroup_shuffle() const;

    // bug is not feature
    // known driver defects that the runtime must work around
    bool bug_storage_buffer_no_l1() const;
    bool bug_corrupted_online_pipeline_cache() const;
    bool bug_buffer_image_load_zero() const;

    // but sometimes bug is a feature
    bool bug_implicit_fp16_arithmetic() const;

    // fp16 and int8 feature
    bool support_fp16_packed() const;
    bool support_fp16_storage() const;
    bool support_fp16_arithmetic() const;
    bool support_int8_packed() const;
    bool support_int8_storage() const;
    bool support_int8_arithmetic() const;

    // ycbcr conversion feature
    bool support_ycbcr_conversion() const;

    // cooperative matrix feature
    bool support_cooperative_matrix() const;
    bool support_cooperative_matrix_16_8_8() const;

    // extension capability
    // nonzero when the device extension is available
    int support_VK_KHR_8bit_storage() const;
    int support_VK_KHR_16bit_storage() const;
    int support_VK_KHR_bind_memory2() const;
    int support_VK_KHR_create_renderpass2() const;
    int support_VK_KHR_dedicated_allocation() const;
    int support_VK_KHR_descriptor_update_template() const;
    int support_VK_KHR_external_memory() const;
    int support_VK_KHR_get_memory_requirements2() const;
    int support_VK_KHR_maintenance1() const;
    int support_VK_KHR_maintenance2() const;
    int support_VK_KHR_maintenance3() const;
    int support_VK_KHR_multiview() const;
    int support_VK_KHR_push_descriptor() const;
    int support_VK_KHR_sampler_ycbcr_conversion() const;
    int support_VK_KHR_shader_float16_int8() const;
    int support_VK_KHR_shader_float_controls() const;
    int support_VK_KHR_storage_buffer_storage_class() const;
    int support_VK_KHR_swapchain() const;
    int support_VK_EXT_descriptor_indexing() const;
    int support_VK_EXT_memory_budget() const;
    int support_VK_EXT_queue_family_foreign() const;
#if __ANDROID_API__ >= 26
    int support_VK_ANDROID_external_memory_android_hardware_buffer() const;
#endif // __ANDROID_API__ >= 26
    int support_VK_NV_cooperative_matrix() const;

private:
    // non-copyable (declared, not defined)
    GpuInfo(const GpuInfo&);
    GpuInfo& operator=(const GpuInfo&);

private:
    // create_gpu_instance fills the private data of every GpuInfo
    friend int create_gpu_instance();
    GpuInfoPrivate* const d; // private implementation, owned
};

// capability snapshot for a device; valid while the gpu instance exists
NCNN_EXPORT const GpuInfo& get_gpu_info(int device_index = get_default_gpu_index());
class VkAllocator;
class VkCompute;
class Option;
class PipelineCache;
class VulkanDevicePrivate;

// Logical Vulkan device wrapper: owns the VkDevice plus shared resources
// (queues, allocators, samplers, dummy mats, pipeline cache) and exposes
// helpers for building compute pipelines.
class NCNN_EXPORT VulkanDevice
{
public:
    VulkanDevice(int device_index = get_default_gpu_index());
    ~VulkanDevice();

    // capability info of the underlying physical device
    const GpuInfo& info;

    VkDevice vkdevice() const;

    // create a shader module from SPIR-V words
    VkShaderModule compile_shader_module(const uint32_t* spv_data, size_t spv_data_size) const;

    // with fixed workgroup size
    VkShaderModule compile_shader_module(const uint32_t* spv_data, size_t spv_data_size, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z) const;

    // helper for creating pipeline
    // return 0 if success
    int create_descriptorset_layout(int binding_count, const int* binding_types, VkDescriptorSetLayout* descriptorset_layout) const;
    int create_pipeline_layout(int push_constant_count, VkDescriptorSetLayout descriptorset_layout, VkPipelineLayout* pipeline_layout) const;
    int create_pipeline(VkShaderModule shader_module, VkPipelineLayout pipeline_layout, const std::vector<vk_specialization_type>& specializations, VkPipeline* pipeline) const;
    int create_descriptor_update_template(int binding_count, const int* binding_types, VkDescriptorSetLayout descriptorset_layout, VkPipelineLayout pipeline_layout, VkDescriptorUpdateTemplateKHR* descriptor_update_template) const;

    // pick a memory type matching required flags, biased by preferred flags
    uint32_t find_memory_index(uint32_t memory_type_bits, VkFlags required, VkFlags preferred, VkFlags preferred_not) const;
    bool is_mappable(uint32_t memory_type_index) const;
    bool is_coherent(uint32_t memory_type_index) const;

    // queue checkout/checkin; every acquire must be paired with a reclaim
    VkQueue acquire_queue(uint32_t queue_family_index) const;
    void reclaim_queue(uint32_t queue_family_index, VkQueue queue) const;

    // allocator on this device
    // pooled checkout/checkin, pair acquire with reclaim
    VkAllocator* acquire_blob_allocator() const;
    void reclaim_blob_allocator(VkAllocator* allocator) const;
    VkAllocator* acquire_staging_allocator() const;
    void reclaim_staging_allocator(VkAllocator* allocator) const;

    // immutable sampler for texelfetch
    const VkSampler* immutable_texelfetch_sampler() const;

    // dummy buffer image
    // placeholders for unused shader bindings
    VkMat get_dummy_buffer() const;
    VkImageMat get_dummy_image() const;
    VkImageMat get_dummy_image_readonly() const;

    // pipeline cache on this device
    const PipelineCache* get_pipeline_cache() const;

    // test image allocation
    // true if a mat of this shape can be backed by image storage
    bool shape_support_image_storage(const Mat& shape) const;

    // current gpu heap memory budget in MB
    uint32_t get_heap_budget() const;

    // utility operator
    // repack elements on device (elempack conversion) via cmd
    void convert_packing(const VkMat& src, VkMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const;
    void convert_packing(const VkImageMat& src, VkImageMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const;
    void convert_packing(const VkMat& src, VkImageMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const;
    void convert_packing(const VkImageMat& src, VkMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const;

    // device-level extension entry points, resolved in init_device_extension();
    // null when the extension is unsupported
    // VK_KHR_bind_memory2
    PFN_vkBindBufferMemory2KHR vkBindBufferMemory2KHR;
    PFN_vkBindImageMemory2KHR vkBindImageMemory2KHR;

    // VK_KHR_create_renderpass2
    PFN_vkCmdBeginRenderPass2KHR vkCmdBeginRenderPass2KHR;
    PFN_vkCmdEndRenderPass2KHR vkCmdEndRenderPass2KHR;
    PFN_vkCmdNextSubpass2KHR vkCmdNextSubpass2KHR;
    PFN_vkCreateRenderPass2KHR vkCreateRenderPass2KHR;

    // VK_KHR_descriptor_update_template
    PFN_vkCreateDescriptorUpdateTemplateKHR vkCreateDescriptorUpdateTemplateKHR;
    PFN_vkDestroyDescriptorUpdateTemplateKHR vkDestroyDescriptorUpdateTemplateKHR;
    PFN_vkUpdateDescriptorSetWithTemplateKHR vkUpdateDescriptorSetWithTemplateKHR;

    // VK_KHR_get_memory_requirements2
    PFN_vkGetImageMemoryRequirements2KHR vkGetImageMemoryRequirements2KHR;
    PFN_vkGetBufferMemoryRequirements2KHR vkGetBufferMemoryRequirements2KHR;
    PFN_vkGetImageSparseMemoryRequirements2KHR vkGetImageSparseMemoryRequirements2KHR;

    // VK_KHR_maintenance1
    PFN_vkTrimCommandPoolKHR vkTrimCommandPoolKHR;

    // VK_KHR_maintenance3
    PFN_vkGetDescriptorSetLayoutSupportKHR vkGetDescriptorSetLayoutSupportKHR;

    // VK_KHR_push_descriptor
    PFN_vkCmdPushDescriptorSetWithTemplateKHR vkCmdPushDescriptorSetWithTemplateKHR;
    PFN_vkCmdPushDescriptorSetKHR vkCmdPushDescriptorSetKHR;

    // VK_KHR_sampler_ycbcr_conversion
    PFN_vkCreateSamplerYcbcrConversionKHR vkCreateSamplerYcbcrConversionKHR;
    PFN_vkDestroySamplerYcbcrConversionKHR vkDestroySamplerYcbcrConversionKHR;

    // VK_KHR_swapchain
    PFN_vkCreateSwapchainKHR vkCreateSwapchainKHR;
    PFN_vkDestroySwapchainKHR vkDestroySwapchainKHR;
    PFN_vkGetSwapchainImagesKHR vkGetSwapchainImagesKHR;
    PFN_vkAcquireNextImageKHR vkAcquireNextImageKHR;
    PFN_vkQueuePresentKHR vkQueuePresentKHR;

#if __ANDROID_API__ >= 26
    // VK_ANDROID_external_memory_android_hardware_buffer
    PFN_vkGetAndroidHardwareBufferPropertiesANDROID vkGetAndroidHardwareBufferPropertiesANDROID;
    PFN_vkGetMemoryAndroidHardwareBufferANDROID vkGetMemoryAndroidHardwareBufferANDROID;
#endif // __ANDROID_API__ >= 26

protected:
    // device extension
    // resolve the function pointers above; return 0 if success
    int init_device_extension();

private:
    // non-copyable (declared, not defined)
    VulkanDevice(const VulkanDevice&);
    VulkanDevice& operator=(const VulkanDevice&);

private:
    VulkanDevicePrivate* const d; // private implementation, owned
};
// shared logical device for the given gpu; ownership stays with the library
NCNN_EXPORT VulkanDevice* get_gpu_device(int device_index = get_default_gpu_index());

// online spirv compilation
// compile GLSL compute source (or a built-in shader by index) to SPIR-V words
// return 0 if success
NCNN_EXPORT int compile_spirv_module(const char* comp_string, const Option& opt, std::vector<uint32_t>& spirv);
NCNN_EXPORT int compile_spirv_module(const char* comp_data, int comp_data_size, const Option& opt, std::vector<uint32_t>& spirv);
NCNN_EXPORT int compile_spirv_module(int shader_type_index, const Option& opt, std::vector<uint32_t>& spirv);
// info from spirv
// binding/constant layout reflected from a SPIR-V module
class NCNN_EXPORT ShaderInfo
{
public:
    int specialization_count;
    int binding_count;
    int push_constant_count;

    // per-binding descriptor type
    // 0 = null
    // 1 = storage buffer
    // 2 = storage image
    // 3 = combined image sampler
    int binding_types[16]; // 16 is large enough I think ...

    // padding for future fields without breaking ABI
    int reserved_0;
    int reserved_1;
    int reserved_2;
    int reserved_3;
};

// reflect shader_info from SPIR-V words; return 0 if success
NCNN_EXPORT int resolve_shader_info(const uint32_t* spv_data, size_t spv_data_size, ShaderInfo& shader_info);
} // namespace ncnn
#endif // NCNN_VULKAN
#endif // NCNN_GPU_H

215
lib_ncnn/ncnn/layer.h Executable file
View File

@@ -0,0 +1,215 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
#ifndef NCNN_LAYER_H
#define NCNN_LAYER_H
#include "mat.h"
#include "modelbin.h"
#include "option.h"
#include "paramdict.h"
#include "platform.h"
#include <math.h>
#if NCNN_VULKAN
#include "command.h"
#include "pipeline.h"
#include <vulkan/vulkan.h>
#endif // NCNN_VULKAN
namespace ncnn {
// Base class of every network layer. The pipeline is:
// load_param -> load_model -> create_pipeline -> forward[_inplace]* -> destroy_pipeline.
// Subclasses override the virtuals they need and advertise capabilities
// via the support_* flags below.
class NCNN_EXPORT Layer
{
public:
    // empty
    Layer();
    // virtual destructor
    virtual ~Layer();

    // load layer specific parameter from parsed dict
    // return 0 if success
    virtual int load_param(const ParamDict& pd);

    // load layer specific weight data from model binary
    // return 0 if success
    virtual int load_model(const ModelBin& mb);

    // layer implementation specific setup
    // return 0 if success
    virtual int create_pipeline(const Option& opt);

    // layer implementation specific clean
    // return 0 if success
    virtual int destroy_pipeline(const Option& opt);

public:
    // capability flags; the net uses these to pick a forward overload
    // one input and one output blob
    bool one_blob_only;
    // support inplace inference
    bool support_inplace;
    // support vulkan compute
    bool support_vulkan;
    // accept input blob with packed storage
    bool support_packing;

    // accept bf16
    bool support_bf16_storage;
    // accept fp16
    bool support_fp16_storage;
    // accept int8
    bool support_int8_storage;

    // shader image storage
    bool support_image_storage;
    // shader tensor storage
    bool support_tensor_storage;

    // placeholders keeping the struct layout stable for future flags
    bool support_reserved_00;
    bool support_reserved_0;
    bool support_reserved_1;
    bool support_reserved_2;
    bool support_reserved_3;
    bool support_reserved_4;
    bool support_reserved_5;
    bool support_reserved_6;
    bool support_reserved_7;
    bool support_reserved_8;
    bool support_reserved_9;
    bool support_reserved_10;
    bool support_reserved_11;
    bool support_reserved_12;
    bool support_reserved_13;

public:
    // implement inference
    // return 0 if success
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

    // implement inplace inference
    // return 0 if success
    virtual int forward_inplace(std::vector<Mat>& bottom_top_blobs, const Option& opt) const;
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

#if NCNN_VULKAN
public:
    // upload weight blob from host to device
    virtual int upload_model(VkTransfer& cmd, const Option& opt);

public:
    // implement inference
    // return 0 if success
    virtual int forward(const std::vector<VkMat>& bottom_blobs, std::vector<VkMat>& top_blobs, VkCompute& cmd, const Option& opt) const;
    virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const;

    // implement inference
    // return 0 if success
    virtual int forward(const std::vector<VkImageMat>& bottom_blobs, std::vector<VkImageMat>& top_blobs, VkCompute& cmd, const Option& opt) const;
    virtual int forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const;

    // implement inplace inference
    // return 0 if success
    virtual int forward_inplace(std::vector<VkMat>& bottom_top_blobs, VkCompute& cmd, const Option& opt) const;
    virtual int forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const;

    // implement inplace inference
    // return 0 if success
    virtual int forward_inplace(std::vector<VkImageMat>& bottom_top_blobs, VkCompute& cmd, const Option& opt) const;
    virtual int forward_inplace(VkImageMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const;

public:
    // assigned immediately after creating this layer
    const VulkanDevice* vkdev;
#endif // NCNN_VULKAN

public:
    // custom user data
    void* userdata;
    // layer type index
    int typeindex;

#if NCNN_STRING
    // layer type name
    std::string type;
    // layer name
    std::string name;
#endif // NCNN_STRING

    // blob index which this layer needs as input
    std::vector<int> bottoms;
    // blob index which this layer produces as output
    std::vector<int> tops;

    // shape hint
    std::vector<Mat> bottom_shapes;
    std::vector<Mat> top_shapes;
};
// layer factory function
typedef Layer* (*layer_creator_func)(void*);
typedef void (*layer_destroyer_func)(Layer*, void*);

// registry row for a built-in layer type
struct layer_registry_entry
{
#if NCNN_STRING
    // layer type name
    const char* name;
#endif // NCNN_STRING
    // layer factory entry
    layer_creator_func creator;
};

// registry row for a user-registered layer; carries its own destroyer
// and an opaque userdata pointer passed back to both callbacks
struct custom_layer_registry_entry
{
#if NCNN_STRING
    // layer type name
    const char* name;
#endif // NCNN_STRING
    // layer factory entry
    layer_creator_func creator;
    layer_destroyer_func destroyer;
    void* userdata;
};

#if NCNN_STRING
// get layer type from type name
NCNN_EXPORT int layer_to_index(const char* type);
// create layer from type name
NCNN_EXPORT Layer* create_layer(const char* type);
#endif // NCNN_STRING
// create layer from layer type
// caller owns the returned layer
NCNN_EXPORT Layer* create_layer(int index);
// defines the factory function "<name>_layer_creator" matching layer_creator_func
#define DEFINE_LAYER_CREATOR(name)                          \
    ::ncnn::Layer* name##_layer_creator(void* /*userdata*/) \
    {                                                       \
        return new name;                                    \
    }

// defines the destroyer function "<name>_layer_destroyer" matching layer_destroyer_func
#define DEFINE_LAYER_DESTROYER(name)                                      \
    void name##_layer_destroyer(::ncnn::Layer* layer, void* /*userdata*/) \
    {                                                                     \
        delete layer;                                                     \
    }
} // namespace ncnn
#endif // NCNN_LAYER_H

View File

@@ -0,0 +1,29 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
#ifndef NCNN_LAYER_SHADER_TYPE_H
#define NCNN_LAYER_SHADER_TYPE_H
namespace ncnn {
namespace LayerShaderType {
// enumerators come from a cmake-generated include; one entry per compiled shader
enum LayerShaderType
{
#include "layer_shader_type_enum.h"
};
} // namespace LayerShaderType
} // namespace ncnn
#endif // NCNN_LAYER_SHADER_TYPE_H

View File

@@ -0,0 +1,5 @@
// Layer Shader Enum header
//
// This file is auto-generated by cmake, don't edit it.

30
lib_ncnn/ncnn/layer_type.h Executable file
View File

@@ -0,0 +1,30 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
#ifndef NCNN_LAYER_TYPE_H
#define NCNN_LAYER_TYPE_H
namespace ncnn {
namespace LayerType {
// enumerators come from a cmake-generated include; one entry per built-in layer
enum LayerType
{
#include "layer_type_enum.h"
    // flag bit distinguishing user-registered custom layer indices
    CustomBit = (1 << 8),
};
} // namespace LayerType
} // namespace ncnn
#endif // NCNN_LAYER_TYPE_H

98
lib_ncnn/ncnn/layer_type_enum.h Executable file
View File

@@ -0,0 +1,98 @@
// Layer Type Enum header
//
// This file is auto-generated by cmake, don't edit it.
AbsVal = 0,
ArgMax = 1,
BatchNorm = 2,
Bias = 3,
BNLL = 4,
Concat = 5,
Convolution = 6,
Crop = 7,
Deconvolution = 8,
Dropout = 9,
Eltwise = 10,
ELU = 11,
Embed = 12,
Exp = 13,
Flatten = 14,
InnerProduct = 15,
Input = 16,
Log = 17,
LRN = 18,
MemoryData = 19,
MVN = 20,
Pooling = 21,
Power = 22,
PReLU = 23,
Proposal = 24,
Reduction = 25,
ReLU = 26,
Reshape = 27,
ROIPooling = 28,
Scale = 29,
Sigmoid = 30,
Slice = 31,
Softmax = 32,
Split = 33,
SPP = 34,
TanH = 35,
Threshold = 36,
Tile = 37,
RNN = 38,
LSTM = 39,
BinaryOp = 40,
UnaryOp = 41,
ConvolutionDepthWise = 42,
Padding = 43,
Squeeze = 44,
ExpandDims = 45,
Normalize = 46,
Permute = 47,
PriorBox = 48,
DetectionOutput = 49,
Interp = 50,
DeconvolutionDepthWise = 51,
ShuffleChannel = 52,
InstanceNorm = 53,
Clip = 54,
Reorg = 55,
YoloDetectionOutput = 56,
Quantize = 57,
Dequantize = 58,
Yolov3DetectionOutput = 59,
PSROIPooling = 60,
ROIAlign = 61,
Packing = 62,
Requantize = 63,
Cast = 64,
HardSigmoid = 65,
SELU = 66,
HardSwish = 67,
Noop = 68,
PixelShuffle = 69,
DeepCopy = 70,
Mish = 71,
StatisticsPooling = 72,
Swish = 73,
Gemm = 74,
GroupNorm = 75,
LayerNorm = 76,
Softplus = 77,
GRU = 78,
MultiHeadAttention = 79,
GELU = 80,
Convolution1D = 81,
Pooling1D = 82,
ConvolutionDepthWise1D = 83,
Convolution3D = 84,
ConvolutionDepthWise3D = 85,
Pooling3D = 86,
MatMul = 87,
Deconvolution1D = 88,
DeconvolutionDepthWise1D = 89,
Deconvolution3D = 90,
DeconvolutionDepthWise3D = 91,
Einsum = 92,

1837
lib_ncnn/ncnn/mat.h Executable file

File diff suppressed because it is too large Load Diff

78
lib_ncnn/ncnn/modelbin.h Executable file
View File

@@ -0,0 +1,78 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
#ifndef NCNN_MODELBIN_H
#define NCNN_MODELBIN_H
#include "mat.h"
namespace ncnn {
class DataReader;
// Abstract source of layer weights: each layer pulls its weight blobs
// through load() while the network model is being loaded.
class NCNN_EXPORT ModelBin
{
public:
ModelBin();
virtual ~ModelBin();
// element type
// 0 = auto
// 1 = float32
// 2 = float16
// 3 = int8
// load vec
virtual Mat load(int w, int type) const = 0;
// load image
virtual Mat load(int w, int h, int type) const;
// load dim
virtual Mat load(int w, int h, int c, int type) const;
};
class ModelBinFromDataReaderPrivate;
// ModelBin implementation that streams weight data from a DataReader
class NCNN_EXPORT ModelBinFromDataReader : public ModelBin
{
public:
explicit ModelBinFromDataReader(const DataReader& dr);
virtual ~ModelBinFromDataReader();
virtual Mat load(int w, int type) const;
private:
// non-copyable
ModelBinFromDataReader(const ModelBinFromDataReader&);
ModelBinFromDataReader& operator=(const ModelBinFromDataReader&);
private:
// pimpl
ModelBinFromDataReaderPrivate* const d;
};
class ModelBinFromMatArrayPrivate;
// ModelBin implementation backed by an in-memory array of pre-built Mats
class NCNN_EXPORT ModelBinFromMatArray : public ModelBin
{
public:
// construct from weight blob array
explicit ModelBinFromMatArray(const Mat* weights);
virtual ~ModelBinFromMatArray();
virtual Mat load(int w, int type) const;
private:
// non-copyable
ModelBinFromMatArray(const ModelBinFromMatArray&);
ModelBinFromMatArray& operator=(const ModelBinFromMatArray&);
private:
// pimpl
ModelBinFromMatArrayPrivate* const d;
};
} // namespace ncnn
#endif // NCNN_MODELBIN_H

42
lib_ncnn/ncnn/ncnn_export.h Executable file
View File

@@ -0,0 +1,42 @@
#ifndef NCNN_EXPORT_H
#define NCNN_EXPORT_H

/* Symbol visibility / export macros for the ncnn public API.
 * This header follows the layout produced by CMake's GenerateExportHeader;
 * in this build configuration the library exports nothing explicitly, so
 * both the "building" and "using" branches expand NCNN_EXPORT to nothing. */

#ifdef NCNN_STATIC_DEFINE
#  define NCNN_EXPORT
#  define NCNN_NO_EXPORT
#else
#  ifndef NCNN_EXPORT
#    ifdef ncnn_EXPORTS
/* We are building this library */
#      define NCNN_EXPORT
#    else
/* We are using this library */
#      define NCNN_EXPORT
#    endif
#  endif

#  ifndef NCNN_NO_EXPORT
#    define NCNN_NO_EXPORT
#  endif
#endif

/* Mark a declaration as deprecated.
 * Fix: the original unconditionally used the GCC/Clang attribute, which is
 * a compile error on MSVC (a supported platform per platform.h); use
 * __declspec(deprecated) there instead. */
#ifndef NCNN_DEPRECATED
#  if defined(_MSC_VER)
#    define NCNN_DEPRECATED __declspec(deprecated)
#  else
#    define NCNN_DEPRECATED __attribute__ ((__deprecated__))
#  endif
#endif

#ifndef NCNN_DEPRECATED_EXPORT
#  define NCNN_DEPRECATED_EXPORT NCNN_EXPORT NCNN_DEPRECATED
#endif

#ifndef NCNN_DEPRECATED_NO_EXPORT
#  define NCNN_DEPRECATED_NO_EXPORT NCNN_NO_EXPORT NCNN_DEPRECATED
#endif

#if 0 /* DEFINE_NO_DEPRECATED */
#  ifndef NCNN_NO_DEPRECATED
#    define NCNN_NO_DEPRECATED
#  endif
#endif

#endif /* NCNN_EXPORT_H */

272
lib_ncnn/ncnn/net.h Executable file
View File

@@ -0,0 +1,272 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
#ifndef NCNN_NET_H
#define NCNN_NET_H
#include "blob.h"
#include "layer.h"
#include "mat.h"
#include "option.h"
#include "platform.h"
#if NCNN_PLATFORM_API
#if __ANDROID_API__ >= 9
#include <android/asset_manager.h>
#endif // __ANDROID_API__ >= 9
#endif // NCNN_PLATFORM_API
namespace ncnn {
#if NCNN_VULKAN
class VkCompute;
#endif // NCNN_VULKAN
class DataReader;
class Extractor;
class NetPrivate;
// Net holds one loaded network: the structure (param) plus the weights
// (model). Typical use: adjust opt, load_param(), load_model(), then
// create_extractor() to run inference.
class NCNN_EXPORT Net
{
public:
// empty init
Net();
// clear and destroy
virtual ~Net();
public:
// option can be changed before loading
Option opt;
#if NCNN_VULKAN
// set gpu device by index
void set_vulkan_device(int device_index);
// set gpu device by device handle, no owner transfer
void set_vulkan_device(const VulkanDevice* vkdev);
// get the gpu device currently bound to this net
const VulkanDevice* vulkan_device() const;
#endif // NCNN_VULKAN
#if NCNN_STRING
// register custom layer by layer type name
// return 0 if success
int register_custom_layer(const char* type, layer_creator_func creator, layer_destroyer_func destroyer = 0, void* userdata = 0);
// map a custom layer type name to its numeric index
virtual int custom_layer_to_index(const char* type);
#endif // NCNN_STRING
// register custom layer by layer type
// return 0 if success
int register_custom_layer(int index, layer_creator_func creator, layer_destroyer_func destroyer = 0, void* userdata = 0);
#if NCNN_STRING
// load network structure from a plain text param DataReader
int load_param(const DataReader& dr);
#endif // NCNN_STRING
// load network structure from a binary param DataReader
int load_param_bin(const DataReader& dr);
// load network weight data from a DataReader
int load_model(const DataReader& dr);
#if NCNN_STDIO
#if NCNN_STRING
// load network structure from plain param file
// return 0 if success
int load_param(FILE* fp);
int load_param(const char* protopath);
// load network structure from in-memory param text
int load_param_mem(const char* mem);
#endif // NCNN_STRING
// load network structure from binary param file
// return 0 if success
int load_param_bin(FILE* fp);
int load_param_bin(const char* protopath);
// load network weight data from model file
// return 0 if success
int load_model(FILE* fp);
int load_model(const char* modelpath);
#endif // NCNN_STDIO
// load network structure from external memory
// memory pointer must be 32-bit aligned
// return bytes consumed
int load_param(const unsigned char* mem);
// reference network weight data from external memory
// weight data is not copied but referenced
// so external memory should be retained when used
// memory pointer must be 32-bit aligned
// return bytes consumed
int load_model(const unsigned char* mem);
#if NCNN_PLATFORM_API
#if __ANDROID_API__ >= 9
#if NCNN_STRING
// convenient load network structure from android asset plain param file
int load_param(AAsset* asset);
int load_param(AAssetManager* mgr, const char* assetpath);
#endif // NCNN_STRING
// convenient load network structure from android asset binary param file
int load_param_bin(AAsset* asset);
int load_param_bin(AAssetManager* mgr, const char* assetpath);
// convenient load network weight data from android asset model file
int load_model(AAsset* asset);
int load_model(AAssetManager* mgr, const char* assetpath);
#endif // __ANDROID_API__ >= 9
#endif // NCNN_PLATFORM_API
// unload network structure and weight data
void clear();
// construct an Extractor from network
Extractor create_extractor() const;
// get input/output indexes/names
const std::vector<int>& input_indexes() const;
const std::vector<int>& output_indexes() const;
#if NCNN_STRING
const std::vector<const char*>& input_names() const;
const std::vector<const char*>& output_names() const;
#endif
// direct access to the internal blob and layer tables
const std::vector<Blob>& blobs() const;
const std::vector<Layer*>& layers() const;
std::vector<Blob>& mutable_blobs();
std::vector<Layer*>& mutable_layers();
protected:
friend class Extractor;
#if NCNN_STRING
// look up a blob / layer index by name
// NOTE(review): presumably returns a negative value when the name is
// unknown -- confirm against net.cpp
int find_blob_index_by_name(const char* name) const;
int find_layer_index_by_name(const char* name) const;
// factory hooks for user-registered layer types; override to customize
virtual Layer* create_custom_layer(const char* type);
#endif // NCNN_STRING
virtual Layer* create_custom_layer(int index);
private:
// non-copyable
Net(const Net&);
Net& operator=(const Net&);
private:
// pimpl: all mutable state lives in NetPrivate
NetPrivate* const d;
};
class ExtractorPrivate;
// Extractor drives one inference over a Net: set blobs with input(), then
// request outputs with extract(). Obtain instances via Net::create_extractor().
class NCNN_EXPORT Extractor
{
public:
virtual ~Extractor();
// copy
Extractor(const Extractor&);
// assign
Extractor& operator=(const Extractor&);
// clear blob mats and allocators
void clear();
// enable light mode
// intermediate blob will be recycled when enabled
// enabled by default
void set_light_mode(bool enable);
// set thread count for this extractor
// this will overwrite the global setting
// default count is system depended
void set_num_threads(int num_threads);
// set blob memory allocator
void set_blob_allocator(Allocator* allocator);
// set workspace memory allocator
void set_workspace_allocator(Allocator* allocator);
#if NCNN_VULKAN
// toggle gpu inference for this extractor
void set_vulkan_compute(bool enable);
void set_blob_vkallocator(VkAllocator* allocator);
void set_workspace_vkallocator(VkAllocator* allocator);
void set_staging_vkallocator(VkAllocator* allocator);
#endif // NCNN_VULKAN
#if NCNN_STRING
// set input by blob name
// return 0 if success
int input(const char* blob_name, const Mat& in);
// get result by blob name
// return 0 if success
// type = 0, default
// type = 1, do not convert fp16/bf16 or / and packing
int extract(const char* blob_name, Mat& feat, int type = 0);
#endif // NCNN_STRING
// set input by blob index
// return 0 if success
int input(int blob_index, const Mat& in);
// get result by blob index
// return 0 if success
// type = 0, default
// type = 1, do not convert fp16/bf16 or / and packing
int extract(int blob_index, Mat& feat, int type = 0);
#if NCNN_VULKAN
#if NCNN_STRING
// set input by blob name
// return 0 if success
int input(const char* blob_name, const VkMat& in);
// get result by blob name
// return 0 if success
int extract(const char* blob_name, VkMat& feat, VkCompute& cmd);
// set input by blob name
// return 0 if success
int input(const char* blob_name, const VkImageMat& in);
// get result by blob name
// return 0 if success
int extract(const char* blob_name, VkImageMat& feat, VkCompute& cmd);
#endif // NCNN_STRING
// set input by blob index
// return 0 if success
int input(int blob_index, const VkMat& in);
// get result by blob index
// return 0 if success
int extract(int blob_index, VkMat& feat, VkCompute& cmd);
// set input by blob index
// return 0 if success
int input(int blob_index, const VkImageMat& in);
// get result by blob index
// return 0 if success
int extract(int blob_index, VkImageMat& feat, VkCompute& cmd);
#endif // NCNN_VULKAN
protected:
// only Net can construct an extractor
friend Extractor Net::create_extractor() const;
Extractor(const Net* net, size_t blob_count);
private:
// pimpl
ExtractorPrivate* const d;
};
} // namespace ncnn
#endif // NCNN_NET_H

153
lib_ncnn/ncnn/option.h Executable file
View File

@@ -0,0 +1,153 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
#ifndef NCNN_OPTION_H
#define NCNN_OPTION_H
#include "platform.h"
namespace ncnn {
#if NCNN_VULKAN
class VkAllocator;
class PipelineCache;
#endif // NCNN_VULKAN
class Allocator;
// Inference-time configuration knobs shared by Net and Extractor.
// Configure before load_param()/load_model(); several groups only take
// effect in specific builds (vulkan, int8, arm).
class NCNN_EXPORT Option
{
public:
// default option
Option();
public:
// light mode
// intermediate blob will be recycled when enabled
// enabled by default
bool lightmode;
// thread count
// default value is the one returned by get_cpu_count()
int num_threads;
// blob memory allocator
Allocator* blob_allocator;
// workspace memory allocator
Allocator* workspace_allocator;
#if NCNN_VULKAN
// blob memory allocator
VkAllocator* blob_vkallocator;
// workspace memory allocator
VkAllocator* workspace_vkallocator;
// staging memory allocator
VkAllocator* staging_vkallocator;
// pipeline cache
PipelineCache* pipeline_cache;
#endif // NCNN_VULKAN
// the time openmp threads busy-wait for more work before going to sleep
// default value is 20ms to keep the cores enabled
// without too much extra power consumption afterwards
int openmp_blocktime;
// enable winograd convolution optimization
// improve convolution 3x3 stride1 performance, may consume more memory
// changes should be applied before loading network structure and weight
// enabled by default
bool use_winograd_convolution;
// enable sgemm convolution optimization
// improve convolution 1x1 stride1 performance, may consume more memory
// changes should be applied before loading network structure and weight
// enabled by default
bool use_sgemm_convolution;
// enable quantized int8 inference
// use low-precision int8 path for quantized model
// changes should be applied before loading network structure and weight
// enabled by default
bool use_int8_inference;
// enable vulkan compute
bool use_vulkan_compute;
// enable bf16 data type for storage
// improve most operator performance on all arm devices, may consume more memory
bool use_bf16_storage;
// enable options for gpu inference
bool use_fp16_packed;
bool use_fp16_storage;
bool use_fp16_arithmetic;
bool use_int8_packed;
bool use_int8_storage;
bool use_int8_arithmetic;
// enable simd-friendly packed memory layout
// improve all operator performance on all arm devices, will consume more memory
// changes should be applied before loading network structure and weight
// enabled by default
bool use_packing_layout;
// NOTE(review): presumably enables pack8 gpu shader variants -- confirm
bool use_shader_pack8;
// subgroup option
bool use_subgroup_basic;
bool use_subgroup_vote;
bool use_subgroup_ballot;
bool use_subgroup_shuffle;
// turn on for adreno
bool use_image_storage;
bool use_tensor_storage;
// reserved placeholder (presumably kept for layout stability)
bool use_reserved_0;
// enable DAZ(Denormals-Are-Zero) and FTZ(Flush-To-Zero)
// default value is 3
// 0 = DAZ OFF, FTZ OFF
// 1 = DAZ ON , FTZ OFF
// 2 = DAZ OFF, FTZ ON
// 3 = DAZ ON, FTZ ON
int flush_denormals;
// use a pool allocator when no allocator is set explicitly -- TODO confirm
bool use_local_pool_allocator;
// enable local memory optimization for gpu inference
bool use_shader_local_memory;
// enable cooperative matrix optimization for gpu inference
bool use_cooperative_matrix;
// more fine-grained control of winograd convolution
bool use_winograd23_convolution;
bool use_winograd43_convolution;
bool use_winograd63_convolution;
// reserved placeholders (presumably kept for layout stability)
bool use_reserved_6;
bool use_reserved_7;
bool use_reserved_8;
bool use_reserved_9;
bool use_reserved_10;
bool use_reserved_11;
};
} // namespace ncnn
#endif // NCNN_OPTION_H

73
lib_ncnn/ncnn/paramdict.h Executable file
View File

@@ -0,0 +1,73 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
#ifndef NCNN_PARAMDICT_H
#define NCNN_PARAMDICT_H
#include "mat.h"
// at most 32 parameters
#define NCNN_MAX_PARAM_COUNT 32
namespace ncnn {
class DataReader;
class Net;
class ParamDictPrivate;
// Per-layer key/value parameter store: int, float or Mat values addressed
// by small integer ids (at most NCNN_MAX_PARAM_COUNT entries), filled in
// while Net parses param data.
class NCNN_EXPORT ParamDict
{
public:
// empty
ParamDict();
virtual ~ParamDict();
// copy
ParamDict(const ParamDict&);
// assign
ParamDict& operator=(const ParamDict&);
// get type
int type(int id) const;
// getters return the stored value for id, falling back to def
// (presumably when the id was never set -- confirm in paramdict.cpp)
// get int
int get(int id, int def) const;
// get float
float get(int id, float def) const;
// get array
Mat get(int id, const Mat& def) const;
// set int
void set(int id, int i);
// set float
void set(int id, float f);
// set array
void set(int id, const Mat& v);
protected:
// Net fills the dict while parsing param data
friend class Net;
void clear();
int load_param(const DataReader& dr);
int load_param_bin(const DataReader& dr);
private:
// pimpl
ParamDictPrivate* const d;
};
} // namespace ncnn
#endif // NCNN_PARAMDICT_H

113
lib_ncnn/ncnn/pipeline.h Executable file
View File

@@ -0,0 +1,113 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
#ifndef NCNN_PIPELINE_H
#define NCNN_PIPELINE_H
#include "mat.h"
#include "platform.h"
#if NCNN_VULKAN
#include "gpu.h"
#include <vulkan/vulkan.h>
#endif // NCNN_VULKAN
namespace ncnn {
#if NCNN_VULKAN
class Option;
class PipelinePrivate;
// One vulkan compute pipeline: exposes the shader module, descriptor set
// layout, pipeline layout, pipeline handle and the local workgroup size.
class NCNN_EXPORT Pipeline
{
public:
explicit Pipeline(const VulkanDevice* vkdev);
virtual ~Pipeline();
public:
// pick a suitable local workgroup size for the given extents
void set_optimal_local_size_xyz(int w = 4, int h = 4, int c = 4);
void set_optimal_local_size_xyz(const Mat& local_size_xyz);
// force an exact local workgroup size
void set_local_size_xyz(int w, int h, int c);
// build the pipeline from raw spir-v
int create(const uint32_t* spv_data, size_t spv_data_size, const std::vector<vk_specialization_type>& specializations);
// build the pipeline from a built-in shader index
int create(int shader_type_index, const Option& opt, const std::vector<vk_specialization_type>& specializations);
public:
// accessors for the created vulkan objects
VkShaderModule shader_module() const;
VkDescriptorSetLayout descriptorset_layout() const;
VkPipelineLayout pipeline_layout() const;
VkPipeline pipeline() const;
VkDescriptorUpdateTemplateKHR descriptor_update_template() const;
const ShaderInfo& shader_info() const;
uint32_t local_size_x() const;
uint32_t local_size_y() const;
uint32_t local_size_z() const;
protected:
// setters for subclasses that assemble the pipeline objects themselves
void set_shader_module(VkShaderModule shader_module);
void set_descriptorset_layout(VkDescriptorSetLayout descriptorset_layout);
void set_pipeline_layout(VkPipelineLayout pipeline_layout);
void set_pipeline(VkPipeline pipeline);
void set_descriptor_update_template(VkDescriptorUpdateTemplateKHR descriptor_update_template);
void set_shader_info(const ShaderInfo& shader_info);
public:
const VulkanDevice* vkdev;
private:
// non-copyable
Pipeline(const Pipeline&);
Pipeline& operator=(const Pipeline&);
private:
// pimpl
PipelinePrivate* const d;
};
#if NCNN_PLATFORM_API
#if __ANDROID_API__ >= 26
class VkCompute;
// Pipeline that imports an AHardwareBuffer-backed image and converts /
// rotates it on the gpu (android api 26+ builds only).
class NCNN_EXPORT ImportAndroidHardwareBufferPipeline : private Pipeline
{
public:
explicit ImportAndroidHardwareBufferPipeline(const VulkanDevice* vkdev);
virtual ~ImportAndroidHardwareBufferPipeline();
// type_to / rotate_from describe the requested output format and rotation
int create(VkAndroidHardwareBufferImageAllocator* ahb_im_allocator, int type_to, int rotate_from, const Option& opt);
int create(VkAndroidHardwareBufferImageAllocator* ahb_im_allocator, int type_to, int rotate_from, int target_width, int target_height, const Option& opt);
void destroy();
friend class VkCompute;
protected:
int create_shader_module(const Option& opt);
int create_sampler(VkAndroidHardwareBufferImageAllocator* ahb_im_allocator);
int create_descriptorset_layout();
public:
int type_to;
int rotate_from;
// NOTE(review): presumably set by the create() overload taking
// target_width/target_height -- confirm in pipeline.cpp
bool need_resize;
VkSampler sampler;
};
#endif // __ANDROID_API__ >= 26
#endif // NCNN_PLATFORM_API
#endif // NCNN_VULKAN
} // namespace ncnn
#endif // NCNN_PIPELINE_H

85
lib_ncnn/ncnn/pipelinecache.h Executable file
View File

@@ -0,0 +1,85 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
#ifndef NCNN_PIPELINECACHE_H
#define NCNN_PIPELINECACHE_H
#include "platform.h"
#if NCNN_VULKAN
#include <vulkan/vulkan.h>
#endif // NCNN_VULKAN
#include "mat.h"
#include "gpu.h"
namespace ncnn {
#if NCNN_VULKAN
class VulkanDevice;
class PipelineCachePrivate;
// Builds vulkan compute pipelines on demand and keeps them so later
// requests for the same shader + specializations can reuse the objects.
class NCNN_EXPORT PipelineCache
{
public:
explicit PipelineCache(const VulkanDevice* _vkdev);
virtual ~PipelineCache();
// drop all cached pipeline objects
void clear();
// look up (or build) a pipeline from raw spir-v
int get_pipeline(const uint32_t* spv_data, size_t spv_data_size, const std::vector<vk_specialization_type>& specializations,
uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z,
VkShaderModule* shader_module,
VkDescriptorSetLayout* descriptorset_layout,
VkPipelineLayout* pipeline_layout,
VkPipeline* pipeline,
VkDescriptorUpdateTemplateKHR* descriptor_update_template,
ShaderInfo& shader_info) const;
// look up (or build) a pipeline from a built-in shader index
int get_pipeline(int shader_type_index, const Option& opt, const std::vector<vk_specialization_type>& specializations,
uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z,
VkShaderModule* shader_module,
VkDescriptorSetLayout* descriptorset_layout,
VkPipelineLayout* pipeline_layout,
VkPipeline* pipeline,
VkDescriptorUpdateTemplateKHR* descriptor_update_template,
ShaderInfo& shader_info) const;
protected:
// helpers used by get_pipeline() to construct the vulkan objects
int create_shader_module(int shader_type_index, const Option& opt, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z,
VkShaderModule* _shader_module, ShaderInfo& si) const;
int new_pipeline(VkShaderModule shader_module, const ShaderInfo& shader_info, const std::vector<vk_specialization_type>& specializations,
VkDescriptorSetLayout* descriptorset_layout,
VkPipelineLayout* pipeline_layout,
VkPipeline* pipeline,
VkDescriptorUpdateTemplateKHR* descriptor_update_template) const;
protected:
const VulkanDevice* vkdev;
private:
// non-copyable
PipelineCache(const PipelineCache&);
PipelineCache& operator=(const PipelineCache&);
private:
// pimpl
PipelineCachePrivate* const d;
};
#endif // NCNN_VULKAN
} // namespace ncnn
#endif // NCNN_PIPELINECACHE_H

273
lib_ncnn/ncnn/platform.h Executable file
View File

@@ -0,0 +1,273 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
#ifndef NCNN_PLATFORM_H
#define NCNN_PLATFORM_H
// Build-time feature switches for this particular ncnn build.
// NOTE(review): this block looks cmake-generated for one configuration
// (x86 with AVX/AVX512 enabled, vulkan disabled) -- change the build
// configuration rather than editing values here.
#define NCNN_STDIO 1
#define NCNN_STRING 1
#define NCNN_SIMPLEOCV 0
#define NCNN_SIMPLEOMP 0
#define NCNN_SIMPLESTL 0
#define NCNN_THREADS 1
#define NCNN_BENCHMARK 0
#define NCNN_C_API 1
#define NCNN_PLATFORM_API 1
#define NCNN_PIXEL 1
#define NCNN_PIXEL_ROTATE 1
#define NCNN_PIXEL_AFFINE 1
#define NCNN_PIXEL_DRAWING 1
#define NCNN_VULKAN 0
#define NCNN_SYSTEM_GLSLANG 0
#define NCNN_RUNTIME_CPU 1
#define NCNN_AVX 1
#define NCNN_XOP 1
#define NCNN_FMA 1
#define NCNN_F16C 1
#define NCNN_AVX2 1
#define NCNN_AVXVNNI 0
#define NCNN_AVX512 1
#define NCNN_AVX512VNNI 1
#if __aarch64__
#define NCNN_ARM82 0
#define NCNN_ARM82DOT 0
#endif // __aarch64__
#define NCNN_MSA 0
#define NCNN_MMI 0
#define NCNN_RVV 0
#define NCNN_INT8 1
#define NCNN_BF16 1
#define NCNN_FORCE_INLINE 1
#define NCNN_VERSION_STRING "1.0.20220617"
#include "ncnn_export.h"
// the threading primitives below are c++-only
#ifdef __cplusplus
#if NCNN_THREADS
#if (defined _WIN32 && !(defined __MINGW32__))
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#include <process.h>
#else
#include <pthread.h>
#endif
#endif // NCNN_THREADS
#if __ANDROID_API__ >= 26
#define VK_USE_PLATFORM_ANDROID_KHR
#endif // __ANDROID_API__ >= 26
namespace ncnn {
#if NCNN_THREADS
#if (defined _WIN32 && !(defined __MINGW32__))
// win32 implementations of the minimal threading primitives used by ncnn
class NCNN_EXPORT Mutex
{
public:
Mutex() { InitializeSRWLock(&srwlock); }
~Mutex() {}
void lock() { AcquireSRWLockExclusive(&srwlock); }
void unlock() { ReleaseSRWLockExclusive(&srwlock); }
private:
// ConditionVariable::wait needs the raw SRWLOCK
friend class ConditionVariable;
// NOTE SRWLock is available from windows vista
SRWLOCK srwlock;
};
class NCNN_EXPORT ConditionVariable
{
public:
ConditionVariable() { InitializeConditionVariable(&condvar); }
~ConditionVariable() {}
// caller must hold mutex; it is released while waiting
void wait(Mutex& mutex) { SleepConditionVariableSRW(&condvar, &mutex.srwlock, INFINITE, 0); }
void broadcast() { WakeAllConditionVariable(&condvar); }
void signal() { WakeConditionVariable(&condvar); }
private:
CONDITION_VARIABLE condvar;
};
// adapter so a pthread-style entry (void* (*)(void*)) can run via _beginthreadex
static unsigned __stdcall start_wrapper(void* args);
class NCNN_EXPORT Thread
{
public:
Thread(void* (*start)(void*), void* args = 0) { _start = start; _args = args; handle = (HANDLE)_beginthreadex(0, 0, start_wrapper, this, 0, 0); }
~Thread() {}
void join() { WaitForSingleObject(handle, INFINITE); CloseHandle(handle); }
private:
friend unsigned __stdcall start_wrapper(void* args)
{
Thread* t = (Thread*)args;
t->_start(t->_args);
return 0;
}
HANDLE handle;
void* (*_start)(void*);
void* _args;
};
// per-thread pointer slot built on TlsAlloc/TlsGetValue
class NCNN_EXPORT ThreadLocalStorage
{
public:
ThreadLocalStorage() { key = TlsAlloc(); }
~ThreadLocalStorage() { TlsFree(key); }
void set(void* value) { TlsSetValue(key, (LPVOID)value); }
void* get() { return (void*)TlsGetValue(key); }
private:
DWORD key;
};
#else // (defined _WIN32 && !(defined __MINGW32__))
// posix implementations of the same primitives, built on pthreads
class NCNN_EXPORT Mutex
{
public:
Mutex() { pthread_mutex_init(&mutex, 0); }
~Mutex() { pthread_mutex_destroy(&mutex); }
void lock() { pthread_mutex_lock(&mutex); }
void unlock() { pthread_mutex_unlock(&mutex); }
private:
// ConditionVariable::wait needs the raw pthread_mutex_t
friend class ConditionVariable;
pthread_mutex_t mutex;
};
class NCNN_EXPORT ConditionVariable
{
public:
ConditionVariable() { pthread_cond_init(&cond, 0); }
~ConditionVariable() { pthread_cond_destroy(&cond); }
// caller must hold mutex; it is released while waiting and re-acquired
// before wait() returns (pthread_cond_wait semantics)
void wait(Mutex& mutex) { pthread_cond_wait(&cond, &mutex.mutex); }
void broadcast() { pthread_cond_broadcast(&cond); }
void signal() { pthread_cond_signal(&cond); }
private:
pthread_cond_t cond;
};
// joinable worker thread with a pthread-style entry function
class NCNN_EXPORT Thread
{
public:
Thread(void* (*start)(void*), void* args = 0) { pthread_create(&t, 0, start, args); }
~Thread() {}
void join() { pthread_join(t, 0); }
private:
pthread_t t;
};
// per-thread pointer slot built on pthread_key_create
class NCNN_EXPORT ThreadLocalStorage
{
public:
ThreadLocalStorage() { pthread_key_create(&key, 0); }
~ThreadLocalStorage() { pthread_key_delete(key); }
void set(void* value) { pthread_setspecific(key, value); }
void* get() { return pthread_getspecific(key); }
private:
pthread_key_t key;
};
#endif // (defined _WIN32 && !(defined __MINGW32__))
#else // NCNN_THREADS
// threads disabled at build time: keep the same interfaces as no-op stubs
// so calling code compiles unchanged
class NCNN_EXPORT Mutex
{
public:
Mutex() {}
~Mutex() {}
void lock() {}
void unlock() {}
};
class NCNN_EXPORT ConditionVariable
{
public:
ConditionVariable() {}
~ConditionVariable() {}
void wait(Mutex& /*mutex*/) {}
void broadcast() {}
void signal() {}
};
class NCNN_EXPORT Thread
{
public:
Thread(void* (*/*start*/)(void*), void* /*args*/ = 0) {}
~Thread() {}
void join() {}
};
// a single slot suffices: "thread local" is trivial with one thread
class NCNN_EXPORT ThreadLocalStorage
{
public:
ThreadLocalStorage() { data = 0; }
~ThreadLocalStorage() {}
void set(void* value) { data = value; }
void* get() { return data; }
private:
void* data;
};
#endif // NCNN_THREADS
// Scoped lock (RAII): acquires the mutex on construction and releases it
// when the guard leaves scope, so early returns cannot leak the lock.
class NCNN_EXPORT MutexLockGuard
{
public:
    MutexLockGuard(Mutex& _mutex)
        : held(_mutex)
    {
        held.lock();
    }
    ~MutexLockGuard()
    {
        held.unlock();
    }

private:
    // the mutex being guarded; reference only, never owned
    Mutex& held;
};
} // namespace ncnn
#if NCNN_SIMPLESTL
#include "simplestl.h"
#else
#include <algorithm>
#include <list>
#include <vector>
#include <string>
#endif
#endif // __cplusplus
// NCNN_LOGE: print an error line to stderr (and also logcat on android)
#if NCNN_STDIO
#if NCNN_PLATFORM_API && __ANDROID_API__ >= 8
#include <android/log.h>
#define NCNN_LOGE(...) do { \
fprintf(stderr, ##__VA_ARGS__); fprintf(stderr, "\n"); \
__android_log_print(ANDROID_LOG_WARN, "ncnn", ##__VA_ARGS__); } while(0)
#else // NCNN_PLATFORM_API && __ANDROID_API__ >= 8
#include <stdio.h>
#define NCNN_LOGE(...) do { \
fprintf(stderr, ##__VA_ARGS__); fprintf(stderr, "\n"); } while(0)
#endif // NCNN_PLATFORM_API && __ANDROID_API__ >= 8
#else
#define NCNN_LOGE(...)
#endif
// NCNN_FORCEINLINE: request aggressive inlining for hot helpers
// NOTE(review): clang defines __clang__ (lowercase) and also __GNUC__, so
// it takes the __GNUC__ branch; the __CLANG__ branch below is effectively
// dead code -- harmless, but worth confirming upstream
#if NCNN_FORCE_INLINE
#ifdef _MSC_VER
#define NCNN_FORCEINLINE __forceinline
#elif defined(__GNUC__)
#define NCNN_FORCEINLINE inline __attribute__((__always_inline__))
#elif defined(__CLANG__)
#if __has_attribute(__always_inline__)
#define NCNN_FORCEINLINE inline __attribute__((__always_inline__))
#else
#define NCNN_FORCEINLINE inline
#endif
#else
#define NCNN_FORCEINLINE inline
#endif
#else
#define NCNN_FORCEINLINE inline
#endif
#endif // NCNN_PLATFORM_H

501
lib_ncnn/ncnn/simpleocv.h Executable file
View File

@@ -0,0 +1,501 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
#ifndef NCNN_SIMPLEOCV_H
#define NCNN_SIMPLEOCV_H
#include "platform.h"
#if NCNN_SIMPLEOCV
#include <limits.h>
#include <string.h>
#include "allocator.h"
#include "mat.h"
// some headers (e.g. windows.h) define min/max macros; save and drop them
// so std::min/std::max below work
#if defined(_MSC_VER) || defined(__GNUC__)
#pragma push_macro("min")
#pragma push_macro("max")
#undef min
#undef max
#endif
// atomic add helper from ncnn, used for Mat refcounting below
#ifndef NCNN_XADD
using ncnn::NCNN_XADD;
#endif
// opencv-style shorthand integer typedefs
typedef unsigned char uchar;
typedef unsigned short ushort;
typedef unsigned int uint;
// imread flags, value-compatible with opencv's CV_LOAD_IMAGE_*
enum
{
CV_LOAD_IMAGE_UNCHANGED = -1,
CV_LOAD_IMAGE_GRAYSCALE = 0,
CV_LOAD_IMAGE_COLOR = 1,
};
// imwrite flags
enum
{
CV_IMWRITE_JPEG_QUALITY = 1
};
// minimal opencv style data structure implementation
namespace cv {
// opencv-style saturate_cast: convert an int to the target type, clamping
// where the target cannot represent the value.
template<typename _Tp>
static inline _Tp saturate_cast(int v)
{
    // generic case: plain conversion, no clamping
    return _Tp(v);
}
template<>
inline uchar saturate_cast<uchar>(int v)
{
    // clamp to the representable range [0, UCHAR_MAX]
    if (v < 0)
        return 0;
    if (v > UCHAR_MAX)
        return (uchar)UCHAR_MAX;
    return (uchar)v;
}
// Fixed-size holder for up to four channel values (opencv-style
// cv::Scalar_). Channels not supplied to a constructor default to zero.
template<typename _Tp>
struct Scalar_
{
    Scalar_()
    {
        v[0] = v[1] = v[2] = v[3] = 0;
    }
    Scalar_(_Tp _v0)
    {
        v[0] = _v0;
        v[1] = v[2] = v[3] = 0;
    }
    Scalar_(_Tp _v0, _Tp _v1, _Tp _v2)
    {
        v[0] = _v0;
        v[1] = _v1;
        v[2] = _v2;
        v[3] = 0;
    }
    Scalar_(_Tp _v0, _Tp _v1, _Tp _v2, _Tp _v3)
    {
        v[0] = _v0;
        v[1] = _v1;
        v[2] = _v2;
        v[3] = _v3;
    }
    // channel access
    const _Tp operator[](const int i) const
    {
        return v[i];
    }
    // NOTE(review): returns by value, so s[i] = x does not modify the scalar
    _Tp operator[](const int i)
    {
        return v[i];
    }
    // channel values
    _Tp v[4];
};
typedef Scalar_<uchar> Scalar;
// 2-d point (opencv-style cv::Point_)
template<typename _Tp>
struct Point_
{
Point_()
: x(0), y(0)
{
}
Point_(_Tp _x, _Tp _y)
: x(_x), y(_y)
{
}
// convert to a point of another element type via saturate_cast
template<typename _Tp2>
operator Point_<_Tp2>() const
{
return Point_<_Tp2>(saturate_cast<_Tp2>(x), saturate_cast<_Tp2>(y));
}
_Tp x;
_Tp y;
};
typedef Point_<int> Point;
typedef Point_<float> Point2f;
// 2-d extent: width and height (opencv-style cv::Size_)
template<typename _Tp>
struct Size_
{
Size_()
: width(0), height(0)
{
}
Size_(_Tp _w, _Tp _h)
: width(_w), height(_h)
{
}
// convert to a size of another element type via saturate_cast
template<typename _Tp2>
operator Size_<_Tp2>() const
{
return Size_<_Tp2>(saturate_cast<_Tp2>(width), saturate_cast<_Tp2>(height));
}
_Tp width;
_Tp height;
};
typedef Size_<int> Size;
typedef Size_<float> Size2f;
// axis-aligned rectangle: top-left corner (x, y) plus width and height
// (opencv-style cv::Rect_)
template<typename _Tp>
struct Rect_
{
Rect_()
: x(0), y(0), width(0), height(0)
{
}
Rect_(_Tp _x, _Tp _y, _Tp _w, _Tp _h)
: x(_x), y(_y), width(_w), height(_h)
{
}
Rect_(Point_<_Tp> _p, Size_<_Tp> _size)
: x(_p.x), y(_p.y), width(_size.width), height(_size.height)
{
}
// convert to a rect of another element type via saturate_cast
template<typename _Tp2>
operator Rect_<_Tp2>() const
{
return Rect_<_Tp2>(saturate_cast<_Tp2>(x), saturate_cast<_Tp2>(y), saturate_cast<_Tp2>(width), saturate_cast<_Tp2>(height));
}
_Tp x;
_Tp y;
_Tp width;
_Tp height;
// area
// NOTE(review): plain width*height -- no overflow or negative-size guard
_Tp area() const
{
return width * height;
}
};
// Intersection-assign: shrink a to the overlap of a and b.
// An empty intersection yields a default (all-zero) rect.
template<typename _Tp>
static inline Rect_<_Tp>& operator&=(Rect_<_Tp>& a, const Rect_<_Tp>& b)
{
    const _Tp left = std::max(a.x, b.x);
    const _Tp top = std::max(a.y, b.y);
    const _Tp right = std::min(a.x + a.width, b.x + b.width);
    const _Tp bottom = std::min(a.y + a.height, b.y + b.height);
    a.x = left;
    a.y = top;
    a.width = right - left;
    a.height = bottom - top;
    if (a.width <= 0 || a.height <= 0)
        a = Rect_<_Tp>();
    return a;
}
// Union-assign: grow a to the bounding box of a and b.
// Matches cv::Rect_ semantics: the union with an empty rect is the other
// rect. (The previous version unconditionally took min/max against an
// empty/default rect, wrongly stretching the result toward the origin.)
template<typename _Tp>
static inline Rect_<_Tp>& operator|=(Rect_<_Tp>& a, const Rect_<_Tp>& b)
{
    if (a.width <= 0 || a.height <= 0)
    {
        a = b;
        return a;
    }
    if (b.width <= 0 || b.height <= 0)
        return a;
    _Tp x1 = std::min(a.x, b.x), y1 = std::min(a.y, b.y);
    a.width = std::max(a.x + a.width, b.x + b.width) - x1;
    a.height = std::max(a.y + a.height, b.y + b.height) - y1;
    a.x = x1;
    a.y = y1;
    return a;
}
// Intersection of two rects (value-returning form of operator&=).
template<typename _Tp>
static inline Rect_<_Tp> operator&(const Rect_<_Tp>& a, const Rect_<_Tp>& b)
{
    Rect_<_Tp> result(a);
    result &= b;
    return result;
}
// Union of two rects (value-returning form of operator|=).
template<typename _Tp>
static inline Rect_<_Tp> operator|(const Rect_<_Tp>& a, const Rect_<_Tp>& b)
{
    Rect_<_Tp> result(a);
    result |= b;
    return result;
}
typedef Rect_<int> Rect;
typedef Rect_<float> Rect2f;
// Mat "type" flags double as the interleaved channel count.
// NOTE(review): CV_32FC1 shares the value 4 with CV_8UC4, so a Mat cannot
// tell a 1-channel float image from a 4-channel uchar image by type() alone.
#define CV_8UC1 1
#define CV_8UC3 3
#define CV_8UC4 4
#define CV_32FC1 4
// Minimal cv::Mat stand-in: a reference-counted rows x cols image with c
// interleaved channels; bookkeeping (total(), ptr()) counts rows*cols*c units.
struct NCNN_EXPORT Mat
{
    // Empty matrix.
    Mat()
        : data(0), refcount(0), rows(0), cols(0), c(0)
    {
    }
    // Allocate a rows x cols matrix; flags is the channel count (CV_8UCn).
    Mat(int _rows, int _cols, int flags)
        : data(0), refcount(0)
    {
        create(_rows, _cols, flags);
    }
    // copy
    // Shallow copy: shares the pixel buffer and bumps the reference count.
    Mat(const Mat& m)
        : data(m.data), refcount(m.refcount)
    {
        if (refcount)
            NCNN_XADD(refcount, 1);
        rows = m.rows;
        cols = m.cols;
        c = m.c;
    }
    // Wrap user-allocated memory; refcount stays null, so the buffer is
    // never freed by this Mat (caller retains ownership).
    Mat(int _rows, int _cols, int flags, void* _data)
        : data((unsigned char*)_data), refcount(0)
    {
        rows = _rows;
        cols = _cols;
        c = flags;
    }
    ~Mat()
    {
        release();
    }
    // assign
    // Shallow-copy assignment; retains the source buffer before releasing
    // the current one.
    Mat& operator=(const Mat& m)
    {
        if (this == &m)
            return *this;
        if (m.refcount)
            NCNN_XADD(m.refcount, 1);
        release();
        data = m.data;
        refcount = m.refcount;
        rows = m.rows;
        cols = m.cols;
        c = m.c;
        return *this;
    }
    // Fill every pixel with the per-channel values of s.
    Mat& operator=(const Scalar& s)
    {
        if (total() > 0)
        {
            uchar* p = data;
            for (int i = 0; i < cols * rows; i++)
            {
                for (int j = 0; j < c; j++)
                {
                    *p++ = s[j];
                }
            }
        }
        return *this;
    }
    // Allocate storage; the reference counter is stored right after the
    // pixel data in the same allocation.
    void create(int _rows, int _cols, int flags)
    {
        release();
        rows = _rows;
        cols = _cols;
        c = flags;
        if (total() > 0)
        {
            // refcount address must be aligned, so we expand totalsize here
            size_t totalsize = (total() + 3) >> 2 << 2;
            data = (uchar*)ncnn::fastMalloc(totalsize + (int)sizeof(*refcount));
            refcount = (int*)(((uchar*)data) + totalsize);
            *refcount = 1;
        }
    }
    // Drop one reference; frees the buffer when this was the last owner
    // (NCNN_XADD returns the counter value before the decrement).
    void release()
    {
        if (refcount && NCNN_XADD(refcount, -1) == 1)
            ncnn::fastFree(data);
        data = 0;
        rows = 0;
        cols = 0;
        c = 0;
        refcount = 0;
    }
    // Deep copy into freshly allocated storage.
    Mat clone() const
    {
        if (empty())
            return Mat();
        Mat m(rows, cols, c);
        if (total() > 0)
        {
            memcpy(m.data, data, total());
        }
        return m;
    }
    bool empty() const
    {
        return data == 0 || total() == 0;
    }
    int channels() const
    {
        return c;
    }
    // Type and channel count are the same value in this implementation.
    int type() const
    {
        return c;
    }
    // Total unit count (bytes for 8-bit data).
    // NOTE(review): computed in int before widening to size_t -- may
    // overflow for very large images.
    size_t total() const
    {
        return cols * rows * c;
    }
    // Pointer to the start of row y (byte-typed; row stride is cols * c).
    const uchar* ptr(int y) const
    {
        return data + y * cols * c;
    }
    uchar* ptr(int y)
    {
        return data + y * cols * c;
    }
    // NOTE(review): these element-typed overloads offset by y * cols * c
    // *elements* of _Tp, i.e. the byte stride scales with sizeof(_Tp);
    // only consistent with the byte overloads for 1-byte _Tp -- confirm
    // against callers before using with wider element types.
    template<typename _Tp>
    const _Tp* ptr(int y) const
    {
        return (const _Tp*)data + y * cols * c;
    }
    template<typename _Tp>
    _Tp* ptr(int y)
    {
        return (_Tp*)data + y * cols * c;
    }
    // roi
    // Deep-copy the region of interest into a new Mat (no view aliasing).
    Mat operator()(const Rect& roi) const
    {
        if (empty())
            return Mat();
        Mat m(roi.height, roi.width, c);
        int sy = roi.y;
        for (int y = 0; y < roi.height; y++)
        {
            const uchar* sptr = ptr(sy) + roi.x * c;
            uchar* dptr = m.ptr(y);
            memcpy(dptr, sptr, roi.width * c);
            sy++;
        }
        return m;
    }
    uchar* data;
    // pointer to the reference counter;
    // when points to user-allocated data, the pointer is NULL
    int* refcount;
    int rows;
    int cols;
    int c; // interleaved channel count (a.k.a. the type flag)
};
// OpenCV-compatible imread flag values (channel layout of the decoded image).
enum ImreadModes
{
    IMREAD_UNCHANGED = -1,
    IMREAD_GRAYSCALE = 0,
    IMREAD_COLOR = 1
};
// Decode an image file; flags selects the layout (see ImreadModes).
NCNN_EXPORT Mat imread(const std::string& path, int flags = IMREAD_COLOR);
// OpenCV-compatible imwrite flag values.
enum ImwriteFlags
{
    IMWRITE_JPEG_QUALITY = 1
};
// Encode and write m to path; params is a flag/value pair list
// (e.g. {IMWRITE_JPEG_QUALITY, 90}).
NCNN_EXPORT bool imwrite(const std::string& path, const Mat& m, const std::vector<int>& params = std::vector<int>());
// Declared for cv API compatibility; NOTE(review): confirm in the
// implementation whether these actually display anything or are stubs.
NCNN_EXPORT void imshow(const std::string& name, const Mat& m);
NCNN_EXPORT int waitKey(int delay = 0);
#if NCNN_PIXEL
// Resize src into dst; NOTE(review): sw/sh look like optional scale factors
// used when size is not fully specified -- confirm in the implementation.
NCNN_EXPORT void resize(const Mat& src, Mat& dst, const Size& size, float sw = 0.f, float sh = 0.f, int flags = 0);
#endif // NCNN_PIXEL
#if NCNN_PIXEL_DRAWING
// Thickness value meaning "fill the shape".
enum
{
    FILLED = -1
};
// Basic drawing primitives operating in-place on img.
NCNN_EXPORT void rectangle(Mat& img, Point pt1, Point pt2, const Scalar& color, int thickness = 1);
NCNN_EXPORT void rectangle(Mat& img, Rect rec, const Scalar& color, int thickness = 1);
NCNN_EXPORT void circle(Mat& img, Point center, int radius, const Scalar& color, int thickness = 1);
NCNN_EXPORT void line(Mat& img, Point p0, Point p1, const Scalar& color, int thickness = 1);
// Only one font face is provided.
enum
{
    FONT_HERSHEY_SIMPLEX = 0
};
NCNN_EXPORT void putText(Mat& img, const std::string& text, Point org, int fontFace, double fontScale, Scalar color, int thickness = 1);
// Measure the rendered size of text; baseLine receives the baseline offset.
NCNN_EXPORT Size getTextSize(const std::string& text, int fontFace, double fontScale, int thickness, int* baseLine);
#endif // NCNN_PIXEL_DRAWING
} // namespace cv
#if defined(_MSC_VER) || defined(__GNUC__)
#pragma pop_macro("min")
#pragma pop_macro("max")
#endif
#endif // NCNN_SIMPLEOCV
#endif // NCNN_SIMPLEOCV_H

53
lib_ncnn/ncnn/simpleomp.h Executable file
View File

@@ -0,0 +1,53 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
#ifndef NCNN_SIMPLEOMP_H
#define NCNN_SIMPLEOMP_H
#include "platform.h"
#if NCNN_SIMPLEOMP
#include <stdint.h>
// This minimal openmp runtime implementation only supports the llvm openmp abi
// and only supports #pragma omp parallel for num_threads(X)
#ifdef __cplusplus
extern "C" {
#endif
// Maximum number of threads a parallel region may use.
NCNN_EXPORT int omp_get_max_threads();
// Set the thread count for subsequent parallel regions.
NCNN_EXPORT void omp_set_num_threads(int num_threads);
// Dynamic thread-adjustment query/setter (OpenMP API compatibility).
NCNN_EXPORT int omp_get_dynamic();
NCNN_EXPORT void omp_set_dynamic(int dynamic);
// Thread count / calling-thread index inside the current parallel region.
NCNN_EXPORT int omp_get_num_threads();
NCNN_EXPORT int omp_get_thread_num();
// LLVM-runtime extension: worker busy-wait time before sleeping.
NCNN_EXPORT int kmp_get_blocktime();
NCNN_EXPORT void kmp_set_blocktime(int blocktime);
#ifdef __cplusplus
}
#endif
#endif // NCNN_SIMPLEOMP
#endif // NCNN_SIMPLEOMP_H

565
lib_ncnn/ncnn/simplestl.h Executable file
View File

@@ -0,0 +1,565 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
#ifndef NCNN_SIMPLESTL_H
#define NCNN_SIMPLESTL_H
#include <stddef.h>
#include <stdint.h>
#include <string.h>
// When the bundled STL replacement is enabled, <new> is not available, so
// the global (placement) allocation operators are declared here; their
// definitions live in ncnn's simplestl implementation -- TODO(review)
// confirm location.
#if !NCNN_SIMPLESTL
#include <new>
#else
// allocation functions
NCNN_EXPORT void* operator new(size_t size);
NCNN_EXPORT void* operator new[](size_t size);
// placement allocation functions
NCNN_EXPORT void* operator new(size_t size, void* ptr);
NCNN_EXPORT void* operator new[](size_t size, void* ptr);
// deallocation functions
NCNN_EXPORT void operator delete(void* ptr);
NCNN_EXPORT void operator delete[](void* ptr);
// deallocation functions since c++14
#if __cplusplus >= 201402L
NCNN_EXPORT void operator delete(void* ptr, size_t sz);
NCNN_EXPORT void operator delete[](void* ptr, size_t sz);
#endif
// placement deallocation functions
NCNN_EXPORT void operator delete(void* ptr, void* voidptr2);
NCNN_EXPORT void operator delete[](void* ptr, void* voidptr2);
#endif
// minimal stl data structure implementation
namespace std {
// std::max replacement; requires only operator< on T.
template<typename T>
const T& max(const T& a, const T& b)
{
    if (a < b)
        return b;
    return a;
}
// std::min replacement; requires only operator> on T.
template<typename T>
const T& min(const T& a, const T& b)
{
    if (a > b)
        return b;
    return a;
}
// std::swap replacement: exchange a and b via a copy (no move support).
template<typename T>
void swap(T& a, T& b)
{
    T tmp(a);
    a = b;
    b = tmp;
}
// Minimal std::pair replacement plus its relational operators and make_pair.
template<typename T1, typename T2>
struct pair
{
    pair()
        : first(), second()
    {
    }
    pair(const T1& t1, const T2& t2)
        : first(t1), second(t2)
    {
    }
    T1 first;
    T2 second;
};
// Equal when both members compare equal.
template<typename T1, typename T2>
bool operator==(const pair<T1, T2>& x, const pair<T1, T2>& y)
{
    return x.first == y.first && x.second == y.second;
}
// Lexicographic order built from operator< only.
template<typename T1, typename T2>
bool operator<(const pair<T1, T2>& x, const pair<T1, T2>& y)
{
    if (x.first < y.first)
        return true;
    if (y.first < x.first)
        return false;
    return x.second < y.second;
}
// Remaining comparisons are derived from == and <.
template<typename T1, typename T2>
bool operator!=(const pair<T1, T2>& x, const pair<T1, T2>& y)
{
    return !(x == y);
}
template<typename T1, typename T2>
bool operator>(const pair<T1, T2>& x, const pair<T1, T2>& y)
{
    return y < x;
}
template<typename T1, typename T2>
bool operator<=(const pair<T1, T2>& x, const pair<T1, T2>& y)
{
    return !(y < x);
}
template<typename T1, typename T2>
bool operator>=(const pair<T1, T2>& x, const pair<T1, T2>& y)
{
    return !(x < y);
}
// Deduce the pair type from the arguments.
template<typename T1, typename T2>
pair<T1, T2> make_pair(const T1& t1, const T2& t2)
{
    return pair<T1, T2>(t1, t2);
}
// Doubly-linked list node holding one value of type T.
template<typename T>
struct node
{
    node* prev_;
    node* next_;
    T data_;
    // Unlinked node with a default-constructed value.
    node()
        : prev_(0), next_(0), data_()
    {
    }
    // Unlinked node holding a copy of t.
    node(const T& t)
        : prev_(0), next_(0), data_(t)
    {
    }
};
// Bidirectional iterator over a chain of node<T>.
template<typename T>
struct iter_list
{
    iter_list()
        : curr_(0)
    {
    }
    iter_list(node<T>* n)
        : curr_(n)
    {
    }
    iter_list(const iter_list& i)
        : curr_(i.curr_)
    {
    }
    ~iter_list()
    {
    }
    iter_list& operator=(const iter_list& i)
    {
        curr_ = i.curr_;
        return *this;
    }
    // Dereference to the value stored in the current node.
    T& operator*()
    {
        return curr_->data_;
    }
    T* operator->()
    {
        return &curr_->data_;
    }
    // Two iterators are equal when they point at the same node.
    bool operator==(const iter_list& i)
    {
        return curr_ == i.curr_;
    }
    bool operator!=(const iter_list& i)
    {
        return !(curr_ == i.curr_);
    }
    iter_list& operator++()
    {
        curr_ = curr_->next_;
        return *this;
    }
    iter_list& operator--()
    {
        curr_ = curr_->prev_;
        return *this;
    }
    node<T>* curr_;
};
// Minimal std::list replacement: a doubly-linked list with a trailing
// sentinel node. tail_ always points at the sentinel, so end() iterates
// over the sentinel and begin() == end() when the list is empty.
template<typename T>
struct list
{
    typedef iter_list<T> iterator;
    list()
    {
        // start with just the sentinel; head_ == tail_ means empty
        head_ = new node<T>();
        tail_ = head_;
        count_ = 0;
    }
    ~list()
    {
        clear();
        delete head_; // at this point head_ is the sentinel again
    }
    list(const list& l)
    {
        head_ = new node<T>();
        tail_ = head_;
        count_ = 0;
        for (iter_list<T> i = l.begin(); i != l.end(); ++i)
        {
            push_back(*i);
        }
    }
    list& operator=(const list& l)
    {
        if (this == &l)
        {
            return *this;
        }
        clear();
        for (iter_list<T> i = l.begin(); i != l.end(); ++i)
        {
            push_back(*i);
        }
        return *this;
    }
    // Remove all elements; the sentinel is kept.
    void clear()
    {
        while (count_ > 0)
        {
            pop_front();
        }
    }
    // Remove the first element (no-op when empty).
    void pop_front()
    {
        if (count_ > 0)
        {
            head_ = head_->next_;
            delete head_->prev_;
            head_->prev_ = 0;
            --count_;
        }
    }
    size_t size() const
    {
        return count_;
    }
    iter_list<T> begin() const
    {
        return iter_list<T>(head_);
    }
    // One-past-the-end iterator (points at the sentinel node).
    iter_list<T> end() const
    {
        return iter_list<T>(tail_);
    }
    bool empty() const
    {
        return count_ == 0;
    }
    // Append a copy of t (inserted just before the sentinel).
    void push_back(const T& t)
    {
        if (count_ == 0)
        {
            // first element becomes the new head, linked to the sentinel
            head_ = new node<T>(t);
            head_->prev_ = 0;
            head_->next_ = tail_;
            tail_->prev_ = head_;
            count_ = 1;
        }
        else
        {
            node<T>* temp = new node<T>(t);
            temp->prev_ = tail_->prev_;
            temp->next_ = tail_;
            tail_->prev_->next_ = temp;
            tail_->prev_ = temp;
            ++count_;
        }
    }
    // Remove the element at pos and return an iterator to the following
    // element (or end()). Erasing end() is a no-op.
    iter_list<T> erase(iter_list<T> pos)
    {
        if (pos != end())
        {
            node<T>* temp = pos.curr_;
            if (temp == head_)
            {
                ++pos;
                temp->next_->prev_ = 0;
                head_ = temp->next_;
            }
            else
            {
                // step back, unlink temp, then step forward again so pos
                // ends up on the node after the erased one
                --pos;
                temp->next_->prev_ = temp->prev_;
                temp->prev_->next_ = temp->next_;
                ++pos;
            }
            delete temp;
            --count_;
        }
        return pos;
    }
protected:
    node<T>* head_; // first element, or the sentinel when empty
    node<T>* tail_; // always the sentinel node
    size_t count_;  // number of stored elements (sentinel excluded)
};
// Function object: descending-order comparison (mirrors std::greater).
template<typename T>
struct greater
{
    bool operator()(const T& x, const T& y) const
    {
        return x > y;
    }
};
// Function object: ascending-order comparison (mirrors std::less).
template<typename T>
struct less
{
    bool operator()(const T& x, const T& y) const
    {
        return x < y;
    }
};
template<typename RandomAccessIter, typename Compare>
void partial_sort(RandomAccessIter first, RandomAccessIter middle, RandomAccessIter last, Compare comp)
{
// [TODO] heap sort should be used here, but we simply use bubble sort now
for (RandomAccessIter i = first; i < middle; ++i)
{
// bubble sort
for (RandomAccessIter j = last - 1; j > first; --j)
{
if (comp(*j, *(j - 1)))
{
swap(*j, *(j - 1));
}
}
}
}
// Minimal std::vector replacement.
// Notes:
//  - try_alloc() over-allocates (capacity = 2 * requested size) and
//    zero-fills fresh storage, then relocates elements with memmove.
//  - NOTE(review): memmove/memset relocation is only valid for trivially
//    relocatable element types; confirm ncnn only instantiates this with
//    such types.
template<typename T>
struct vector
{
    vector()
        : data_(0), size_(0), capacity_(0)
    {
    }
    // Construct with new_size copies of value.
    vector(const size_t new_size, const T& value = T())
        : data_(0), size_(0), capacity_(0)
    {
        resize(new_size, value);
    }
    ~vector()
    {
        clear();
    }
    vector(const vector& v)
        : data_(0), size_(0), capacity_(0)
    {
        resize(v.size());
        for (size_t i = 0; i < size_; i++)
        {
            data_[i] = v.data_[i];
        }
    }
    vector& operator=(const vector& v)
    {
        if (this == &v)
        {
            return *this;
        }
        // destroy current elements, then default-construct and copy-assign
        resize(0);
        resize(v.size());
        for (size_t i = 0; i < size_; i++)
        {
            data_[i] = v.data_[i];
        }
        return *this;
    }
    // Grow (placement-new copies of value) or shrink (destroy the tail).
    void resize(const size_t new_size, const T& value = T())
    {
        try_alloc(new_size);
        if (new_size > size_)
        {
            for (size_t i = size_; i < new_size; i++)
            {
                new (&data_[i]) T(value);
            }
        }
        else if (new_size < size_)
        {
            for (size_t i = new_size; i < size_; i++)
            {
                data_[i].~T();
            }
        }
        size_ = new_size;
    }
    // Destroy all elements and free the storage.
    void clear()
    {
        for (size_t i = 0; i < size_; i++)
        {
            data_[i].~T();
        }
        delete[](char*) data_;
        data_ = 0;
        size_ = 0;
        capacity_ = 0;
    }
    T* data() const
    {
        return data_;
    }
    size_t size() const
    {
        return size_;
    }
    // NOTE(review): non-const reference from a const method -- intentional
    // shortcut in this minimal implementation.
    T& operator[](size_t i) const
    {
        return data_[i];
    }
    T* begin() const
    {
        return &data_[0];
    }
    T* end() const
    {
        return &data_[size_];
    }
    bool empty() const
    {
        return size_ == 0;
    }
    void push_back(const T& t)
    {
        try_alloc(size_ + 1);
        new (&data_[size_]) T(t);
        size_++;
    }
    // Insert [b, e) before pos. Self-insertion (b/e pointing into this
    // vector) is handled by copying the source range first.
    void insert(T* pos, T* b, T* e)
    {
        vector* v = 0;
        if (b >= begin() && b < end())
        {
            //the same vector
            v = new vector(*this);
            b = v->begin() + (b - begin());
            e = v->begin() + (e - begin());
        }
        size_t diff = pos - begin();
        try_alloc(size_ + (e - b));
        pos = begin() + diff; // try_alloc may have moved the storage
        memmove(pos + (e - b), pos, (end() - pos) * sizeof(T));
        size_t len = e - b;
        size_ += len;
        for (size_t i = 0; i < len; i++)
        {
            *pos = *b;
            pos++;
            b++;
        }
        delete v;
    }
    // Destroy *pos and close the gap; returns pos (now the next element).
    T* erase(T* pos)
    {
        pos->~T();
        memmove(pos, pos + 1, (end() - pos - 1) * sizeof(T));
        size_--;
        return pos;
    }
protected:
    T* data_;
    size_t size_;
    size_t capacity_;
    // Ensure room for new_size elements; reallocates (to 2 * new_size)
    // whenever the request exceeds roughly a third of the current capacity,
    // so capacity stays well ahead of size and fresh storage is zero-filled.
    void try_alloc(size_t new_size)
    {
        if (new_size * 3 / 2 > capacity_ / 2)
        {
            capacity_ = new_size * 2;
            T* new_data = (T*)new char[capacity_ * sizeof(T)];
            memset(new_data, 0, capacity_ * sizeof(T));
            if (data_)
            {
                memmove(new_data, data_, sizeof(T) * size_);
                delete[](char*) data_;
            }
            data_ = new_data;
        }
    }
};
// Minimal std::string replacement built on vector<char>.
// Storage invariant: try_alloc() zero-fills fresh capacity and keeps
// capacity comfortably larger than size, so the byte at data_[size()] is
// always '\0' whenever data_ is non-null -- that is what makes c_str()
// work without storing an explicit terminator.
struct NCNN_EXPORT string : public vector<char>
{
    // Empty string; data_ stays null until something is stored.
    string()
    {
    }
    // Copy the bytes of a C string (the terminator comes from the
    // zero-filled over-allocation, see the invariant above).
    string(const char* str)
    {
        size_t len = strlen(str);
        resize(len);
        memcpy(data_, str, len);
    }
    // C-string view; never returns null. (A default-constructed string has
    // null data_, which previously made the comparison operators call
    // strcmp(NULL, ...) -- undefined behavior.)
    const char* c_str() const
    {
        return data_ ? data_ : "";
    }
    bool operator==(const string& str2) const
    {
        return strcmp(c_str(), str2.c_str()) == 0;
    }
    bool operator==(const char* str2) const
    {
        return strcmp(c_str(), str2) == 0;
    }
    bool operator!=(const char* str2) const
    {
        return strcmp(c_str(), str2) != 0;
    }
    // Append str1 in place.
    string& operator+=(const string& str1)
    {
        insert(end(), str1.begin(), str1.end());
        return *this;
    }
};
// Concatenate two strings into a newly built string.
inline string operator+(const string& str1, const string& str2)
{
    string result(str1);
    result.insert(result.end(), str2.begin(), str2.end());
    return result;
}
} // namespace std
#endif // NCNN_SIMPLESTL_H

251
lib_ncnn/ncnn/vulkan_header_fix.h Executable file
View File

@@ -0,0 +1,251 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
#ifndef NCNN_VULKAN_HEADER_FIX_H
#define NCNN_VULKAN_HEADER_FIX_H
#include <vulkan/vulkan.h>
// This header contains new structure and function declearation to fix build with old vulkan sdk
// Backfill subgroup-properties, maintenance3 and descriptor-set-layout-
// support types for SDKs older than header version 70. These definitions
// must stay byte-compatible with the official Vulkan headers.
#if VK_HEADER_VERSION < 70
#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES (VkStructureType)1000094000
typedef enum VkSubgroupFeatureFlagBits
{
    VK_SUBGROUP_FEATURE_BASIC_BIT = 0x00000001,
    VK_SUBGROUP_FEATURE_VOTE_BIT = 0x00000002,
    VK_SUBGROUP_FEATURE_ARITHMETIC_BIT = 0x00000004,
    VK_SUBGROUP_FEATURE_BALLOT_BIT = 0x00000008,
    VK_SUBGROUP_FEATURE_SHUFFLE_BIT = 0x00000010,
    VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT = 0x00000020,
    VK_SUBGROUP_FEATURE_CLUSTERED_BIT = 0x00000040,
    VK_SUBGROUP_FEATURE_QUAD_BIT = 0x00000080,
    VK_SUBGROUP_FEATURE_PARTITIONED_BIT_NV = 0x00000100,
    VK_SUBGROUP_FEATURE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF
} VkSubgroupFeatureFlagBits;
typedef VkFlags VkSubgroupFeatureFlags;
typedef struct VkPhysicalDeviceSubgroupProperties
{
    VkStructureType sType;
    void* pNext;
    uint32_t subgroupSize;
    VkShaderStageFlags supportedStages;
    VkSubgroupFeatureFlags supportedOperations;
    VkBool32 quadOperationsInAllStages;
} VkPhysicalDeviceSubgroupProperties;
#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MAINTENANCE_3_PROPERTIES (VkStructureType)1000168000
#define VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_SUPPORT (VkStructureType)1000168001
typedef struct VkPhysicalDeviceMaintenance3Properties
{
    VkStructureType sType;
    void* pNext;
    uint32_t maxPerSetDescriptors;
    VkDeviceSize maxMemoryAllocationSize;
} VkPhysicalDeviceMaintenance3Properties;
typedef struct VkDescriptorSetLayoutSupport
{
    VkStructureType sType;
    void* pNext;
    VkBool32 supported;
} VkDescriptorSetLayoutSupport;
typedef VkPhysicalDeviceMaintenance3Properties VkPhysicalDeviceMaintenance3PropertiesKHR;
typedef VkDescriptorSetLayoutSupport VkDescriptorSetLayoutSupportKHR;
typedef void(VKAPI_PTR* PFN_vkGetDescriptorSetLayoutSupportKHR)(VkDevice device, const VkDescriptorSetLayoutCreateInfo* pCreateInfo, VkDescriptorSetLayoutSupport* pSupport);
#endif // VK_HEADER_VERSION < 70
// Backfill VK_KHR_8bit_storage and VK_KHR_create_renderpass2 types for SDKs
// older than header version 80. Must stay byte-compatible with the official
// Vulkan headers.
#if VK_HEADER_VERSION < 80
#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES_KHR (VkStructureType)1000177000
typedef struct VkPhysicalDevice8BitStorageFeaturesKHR
{
    VkStructureType sType;
    void* pNext;
    VkBool32 storageBuffer8BitAccess;
    VkBool32 uniformAndStorageBuffer8BitAccess;
    VkBool32 storagePushConstant8;
} VkPhysicalDevice8BitStorageFeaturesKHR;
#define VK_STRUCTURE_TYPE_ATTACHMENT_DESCRIPTION_2_KHR (VkStructureType)1000109000
#define VK_STRUCTURE_TYPE_ATTACHMENT_REFERENCE_2_KHR (VkStructureType)1000109001
#define VK_STRUCTURE_TYPE_SUBPASS_DESCRIPTION_2_KHR (VkStructureType)1000109002
#define VK_STRUCTURE_TYPE_SUBPASS_DEPENDENCY_2_KHR (VkStructureType)1000109003
#define VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO_2_KHR (VkStructureType)1000109004
#define VK_STRUCTURE_TYPE_SUBPASS_BEGIN_INFO_KHR (VkStructureType)1000109005
#define VK_STRUCTURE_TYPE_SUBPASS_END_INFO_KHR (VkStructureType)1000109006
typedef struct VkAttachmentDescription2KHR
{
    VkStructureType sType;
    const void* pNext;
    VkAttachmentDescriptionFlags flags;
    VkFormat format;
    VkSampleCountFlagBits samples;
    VkAttachmentLoadOp loadOp;
    VkAttachmentStoreOp storeOp;
    VkAttachmentLoadOp stencilLoadOp;
    VkAttachmentStoreOp stencilStoreOp;
    VkImageLayout initialLayout;
    VkImageLayout finalLayout;
} VkAttachmentDescription2KHR;
typedef struct VkAttachmentReference2KHR
{
    VkStructureType sType;
    const void* pNext;
    uint32_t attachment;
    VkImageLayout layout;
    VkImageAspectFlags aspectMask;
} VkAttachmentReference2KHR;
typedef struct VkSubpassDescription2KHR
{
    VkStructureType sType;
    const void* pNext;
    VkSubpassDescriptionFlags flags;
    VkPipelineBindPoint pipelineBindPoint;
    uint32_t viewMask;
    uint32_t inputAttachmentCount;
    const VkAttachmentReference2KHR* pInputAttachments;
    uint32_t colorAttachmentCount;
    const VkAttachmentReference2KHR* pColorAttachments;
    const VkAttachmentReference2KHR* pResolveAttachments;
    const VkAttachmentReference2KHR* pDepthStencilAttachment;
    uint32_t preserveAttachmentCount;
    const uint32_t* pPreserveAttachments;
} VkSubpassDescription2KHR;
typedef struct VkSubpassDependency2KHR
{
    VkStructureType sType;
    const void* pNext;
    uint32_t srcSubpass;
    uint32_t dstSubpass;
    VkPipelineStageFlags srcStageMask;
    VkPipelineStageFlags dstStageMask;
    VkAccessFlags srcAccessMask;
    VkAccessFlags dstAccessMask;
    VkDependencyFlags dependencyFlags;
    int32_t viewOffset;
} VkSubpassDependency2KHR;
typedef struct VkRenderPassCreateInfo2KHR
{
    VkStructureType sType;
    const void* pNext;
    VkRenderPassCreateFlags flags;
    uint32_t attachmentCount;
    const VkAttachmentDescription2KHR* pAttachments;
    uint32_t subpassCount;
    const VkSubpassDescription2KHR* pSubpasses;
    uint32_t dependencyCount;
    const VkSubpassDependency2KHR* pDependencies;
    uint32_t correlatedViewMaskCount;
    const uint32_t* pCorrelatedViewMasks;
} VkRenderPassCreateInfo2KHR;
typedef struct VkSubpassBeginInfoKHR
{
    VkStructureType sType;
    const void* pNext;
    VkSubpassContents contents;
} VkSubpassBeginInfoKHR;
typedef struct VkSubpassEndInfoKHR
{
    VkStructureType sType;
    const void* pNext;
} VkSubpassEndInfoKHR;
// Function-pointer types for the renderpass2 entry points (resolved at
// runtime via vkGetDeviceProcAddr).
typedef VkResult(VKAPI_PTR* PFN_vkCreateRenderPass2KHR)(VkDevice device, const VkRenderPassCreateInfo2KHR* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkRenderPass* pRenderPass);
typedef void(VKAPI_PTR* PFN_vkCmdBeginRenderPass2KHR)(VkCommandBuffer commandBuffer, const VkRenderPassBeginInfo* pRenderPassBegin, const VkSubpassBeginInfoKHR* pSubpassBeginInfo);
typedef void(VKAPI_PTR* PFN_vkCmdNextSubpass2KHR)(VkCommandBuffer commandBuffer, const VkSubpassBeginInfoKHR* pSubpassBeginInfo, const VkSubpassEndInfoKHR* pSubpassEndInfo);
typedef void(VKAPI_PTR* PFN_vkCmdEndRenderPass2KHR)(VkCommandBuffer commandBuffer, const VkSubpassEndInfoKHR* pSubpassEndInfo);
#endif // VK_HEADER_VERSION < 80
// Backfill VK_KHR_shader_float16_int8 feature struct for SDKs older than
// header version 95.
#if VK_HEADER_VERSION < 95
#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT16_INT8_FEATURES_KHR (VkStructureType)1000082000
typedef struct VkPhysicalDeviceFloat16Int8FeaturesKHR
{
    VkStructureType sType;
    void* pNext;
    VkBool32 shaderFloat16;
    VkBool32 shaderInt8;
} VkPhysicalDeviceFloat16Int8FeaturesKHR;
#endif // VK_HEADER_VERSION < 95
// Backfill VK_EXT_memory_budget properties struct for SDKs older than
// header version 97.
#if VK_HEADER_VERSION < 97
#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_BUDGET_PROPERTIES_EXT (VkStructureType)1000237000
typedef struct VkPhysicalDeviceMemoryBudgetPropertiesEXT
{
    VkStructureType sType;
    void* pNext;
    VkDeviceSize heapBudget[VK_MAX_MEMORY_HEAPS];
    VkDeviceSize heapUsage[VK_MAX_MEMORY_HEAPS];
} VkPhysicalDeviceMemoryBudgetPropertiesEXT;
#endif // VK_HEADER_VERSION < 97
// Backfill VK_NV_cooperative_matrix enums/structs for SDKs older than
// header version 101. Must stay byte-compatible with the official headers.
#if VK_HEADER_VERSION < 101
#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_NV (VkStructureType)1000249000
#define VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_PROPERTIES_NV (VkStructureType)1000249001
#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_PROPERTIES_NV (VkStructureType)1000249002
typedef enum VkComponentTypeNV
{
    VK_COMPONENT_TYPE_FLOAT16_NV = 0,
    VK_COMPONENT_TYPE_FLOAT32_NV = 1,
    VK_COMPONENT_TYPE_FLOAT64_NV = 2,
    VK_COMPONENT_TYPE_SINT8_NV = 3,
    VK_COMPONENT_TYPE_SINT16_NV = 4,
    VK_COMPONENT_TYPE_SINT32_NV = 5,
    VK_COMPONENT_TYPE_SINT64_NV = 6,
    VK_COMPONENT_TYPE_UINT8_NV = 7,
    VK_COMPONENT_TYPE_UINT16_NV = 8,
    VK_COMPONENT_TYPE_UINT32_NV = 9,
    VK_COMPONENT_TYPE_UINT64_NV = 10,
    VK_COMPONENT_TYPE_BEGIN_RANGE_NV = VK_COMPONENT_TYPE_FLOAT16_NV,
    VK_COMPONENT_TYPE_END_RANGE_NV = VK_COMPONENT_TYPE_UINT64_NV,
    VK_COMPONENT_TYPE_RANGE_SIZE_NV = (VK_COMPONENT_TYPE_UINT64_NV - VK_COMPONENT_TYPE_FLOAT16_NV + 1),
    VK_COMPONENT_TYPE_MAX_ENUM_NV = 0x7FFFFFFF
} VkComponentTypeNV;
typedef enum VkScopeNV
{
    VK_SCOPE_DEVICE_NV = 1,
    VK_SCOPE_WORKGROUP_NV = 2,
    VK_SCOPE_SUBGROUP_NV = 3,
    VK_SCOPE_QUEUE_FAMILY_NV = 5,
    VK_SCOPE_BEGIN_RANGE_NV = VK_SCOPE_DEVICE_NV,
    VK_SCOPE_END_RANGE_NV = VK_SCOPE_QUEUE_FAMILY_NV,
    VK_SCOPE_RANGE_SIZE_NV = (VK_SCOPE_QUEUE_FAMILY_NV - VK_SCOPE_DEVICE_NV + 1),
    VK_SCOPE_MAX_ENUM_NV = 0x7FFFFFFF
} VkScopeNV;
// One supported (M, N, K) matrix-multiply shape and its element types.
typedef struct VkCooperativeMatrixPropertiesNV
{
    VkStructureType sType;
    void* pNext;
    uint32_t MSize;
    uint32_t NSize;
    uint32_t KSize;
    VkComponentTypeNV AType;
    VkComponentTypeNV BType;
    VkComponentTypeNV CType;
    VkComponentTypeNV DType;
    VkScopeNV scope;
} VkCooperativeMatrixPropertiesNV;
typedef struct VkPhysicalDeviceCooperativeMatrixFeaturesNV
{
    VkStructureType sType;
    void* pNext;
    VkBool32 cooperativeMatrix;
    VkBool32 cooperativeMatrixRobustBufferAccess;
} VkPhysicalDeviceCooperativeMatrixFeaturesNV;
typedef struct VkPhysicalDeviceCooperativeMatrixPropertiesNV
{
    VkStructureType sType;
    void* pNext;
    VkShaderStageFlags cooperativeMatrixSupportedStages;
} VkPhysicalDeviceCooperativeMatrixPropertiesNV;
typedef VkResult(VKAPI_PTR* PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesNV)(VkPhysicalDevice physicalDevice, uint32_t* pPropertyCount, VkCooperativeMatrixPropertiesNV* pProperties);
#endif // VK_HEADER_VERSION < 101
#endif // NCNN_VULKAN_HEADER_FIX_H