Enthernet Code
DevHeads IoT Integration Server
Created by Enthernet Code on 6/24/2024 in #middleware-and-os
How do I optimize memory usage for a neural network running on an ARM Cortex-M4 using CMSIS-NN?
Thanks, this was helpful. I tried it out 👇
#include <string.h>
#include "arm_nnfunctions.h"

#define INTERMEDIATE_SIZE 1024
#define OUTPUT_SIZE 512
#define MAX_BUFFER_SIZE ((INTERMEDIATE_SIZE > OUTPUT_SIZE) ? INTERMEDIATE_SIZE : OUTPUT_SIZE)

// Model-specific macros (CONV1_* and FC1_* weights, biases, dimensions and
// shifts) are assumed to be defined elsewhere for the target network.

// im2col scratch buffer required by the q7 convolution kernels
static q15_t col_buffer[2 * CONV1_KER_DIM * CONV1_KER_DIM * CONV1_IN_CH];

void run_nn(const q7_t* input_data) {
    // Use a single buffer for both intermediate and output data
    q7_t shared_buffer[MAX_BUFFER_SIZE];

    // Run the first convolution layer into the shared buffer
    arm_convolve_HWC_q7_basic(input_data, CONV1_IN_DIM, CONV1_IN_CH,
                              CONV1_WEIGHT, CONV1_OUT_CH, CONV1_KER_DIM,
                              CONV1_PADDING, CONV1_STRIDE,
                              CONV1_BIAS, CONV1_BIAS_SHIFT, CONV1_OUT_SHIFT,
                              shared_buffer, CONV1_OUT_DIM,
                              col_buffer, NULL);

    // Continue with other layers, reusing the shared buffer. For layers that
    // cannot run in place, write into a second region of the buffer instead
    // of passing the same pointer as both input and output. For example:
    // arm_fully_connected_q7(shared_buffer, FC1_WEIGHT, FC1_IN_DIM, FC1_OUT_DIM,
    //                        FC1_BIAS_SHIFT, FC1_OUT_SHIFT, FC1_BIAS,
    //                        shared_buffer + FC1_IN_DIM, col_buffer);

    // Copy the final output out of the shared buffer if needed
    q7_t output_data[OUTPUT_SIZE];
    memcpy(output_data, shared_buffer, OUTPUT_SIZE * sizeof(q7_t));
}
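For layers whose input and output must not overlap, the same single-buffer idea still works if the buffer is split in two and the halves alternate roles between layers. Below is a minimal sketch of that ping-pong variant; ARENA_HALF, nn_arena, run_nn_pingpong, buf_a/buf_b and the commented-out layer calls are hypothetical placeholders (not CMSIS-NN names), so size and wire them for your own model.

#include "arm_nnfunctions.h"

#define ARENA_HALF 1024                  // assumed worst-case activation size per layer

// One static arena instead of per-layer buffers; the two halves alternate
static q7_t nn_arena[2 * ARENA_HALF];

void run_nn_pingpong(const q7_t* input_data) {
    q7_t* buf_a = nn_arena;              // first half of the arena
    q7_t* buf_b = nn_arena + ARENA_HALF; // second half of the arena

    // Layer 1: read input_data, write activations into buf_a
    // arm_convolve_HWC_q7_basic(input_data, ..., buf_a, ..., col_buffer, NULL);

    // Layer 2: read buf_a, write into buf_b (no in-place aliasing)
    // arm_fully_connected_q7(buf_a, FC1_WEIGHT, ..., buf_b, vec_buffer);

    // Later layers keep alternating buf_a <-> buf_b; the final result ends
    // up in whichever half the last layer wrote to.
    (void)buf_a; (void)buf_b; (void)input_data;
}

Because the arena is static it lives in .bss rather than on the task stack, so the worst-case activation RAM is visible at link time instead of risking a stack overflow at run time.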