Time taken in Inference session

This is my code:
# Imports assumed for MAX 24.3's Mojo Graph API.
from max import engine
from max.graph import Graph, TensorType, Type, ops
from collections import List
from time import now

# Graph with three float32 inputs: a data matrix, a weight matrix, and a bias vector.
var graph = Graph(in_types=List[Type](
    TensorType(DType.float32, N * (H - K + 1) * (W - K + 1), K * K * C_in),
    TensorType(DType.float32, K * K * C_in, C_out),
    TensorType(DType.float32, bias.shape()[0])))
var out = graph[0] @ graph[1]   # matmul
var out1 = out + graph[2]       # add bias
var out2 = ops.relu(out1)       # ReLU
graph.output(out2)
graph.verify()

# Time session creation plus loading the graph into the session.
var start_time = now()
var session = engine.InferenceSession()
var model = session.load(graph)
var end_time = now()
var execution_time: Float32 = (end_time - start_time)
var execution_time_seconds: Float32 = execution_time / 1000000000
print("execution_time_seconds for creating inference session and loading graph:", execution_time_seconds)

var out_names = model.get_model_output_names()
# Time a single inference through the loaded model.
var start_time1 = now()
var ret = model.execute("input0", A, "input1", wow, "input2", bias)
var end_time1 = now()
var execution_time1: Float32 = (end_time1 - start_time1)
var execution_time_seconds1: Float32 = execution_time1 / 1000000000
print("execution_time_seconds1 running the inference:", execution_time_seconds1)
Output:
execution_time_seconds for creating inference session and loading graph: 4.304471492767334
execution_time_seconds1 running the inference: 0.00044964000699110329
Why does creating an inference session and loading the graph into it to build the model take so long, i.e., around 4 seconds? Also, is there a workaround to reduce this time, as I have to create different models for my use case?
MAX version: max 24.3.0 (9882e19d)
2 Replies
Ehsan M. Kermani (Modular)
That's the time spent compiling the graph. We'll soon be exposing caching so that subsequent runs become much faster.
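Until then, the compile cost can be amortized by loading each graph once and reusing the resulting model for every inference. A minimal sketch of that pattern, reusing the graph, A, wow, and bias already built in the question above (their construction is not repeated here) and the same MAX 24.3 Mojo API:

from max import engine

# Pay the compile cost once per graph...
var session = engine.InferenceSession()
var model = session.load(graph)   # the slow, compile-heavy step from the timings above

# ...then every later call only pays the fast execute path (~0.45 ms in the output above).
for _ in range(1000):
    var ret = model.execute("input0", A, "input1", wow, "input2", bias)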
taalhaataahir01022001
I'm new, and sorry for asking such basic questions.
1. I am compiling my code with Mojo, i.e.,
mojo build matmul.mojo
and then running the executable. Does this mean that compilation of the graph by the MAX Engine compiler occurs at runtime, and that graph optimizations are also performed at runtime?
2. During execution, does the Mojo compiler invoke the MAX Engine compiler to compile and optimize the graph?
3. I have written my entire model in Mojo. Now I am planning to break the model down into custom operations in MAX Graph. Will this provide performance benefits, since the MAX Engine compiler might further optimize my custom operations written in Mojo?
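On questions 1 and 2, one way to see where the time goes is to time session creation and session.load() separately in the built executable; the graph compilation mentioned above happens at runtime inside this measured span, not during mojo build of the host program. A small sketch, assuming the same API and the graph from the original post (construction omitted):

from max import engine
from time import now

# `graph` is the verified Graph built earlier in the thread.
var t0 = now()
var session = engine.InferenceSession()
var t1 = now()
var model = session.load(graph)   # MAX Engine compiles and optimizes the graph here, at runtime
var t2 = now()
print("session creation (s):", Float64(t1 - t0) / 1e9)
print("graph load + compile (s):", Float64(t2 - t1) / 1e9)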