diff --git "a/segmentation_1_4_0_fp32_combined/vaiml_par_0/0/par.subgraph.pre-dse.mlir" "b/segmentation_1_4_0_fp32_combined/vaiml_par_0/0/par.subgraph.pre-dse.mlir" new file mode 100644--- /dev/null +++ "b/segmentation_1_4_0_fp32_combined/vaiml_par_0/0/par.subgraph.pre-dse.mlir" @@ -0,0 +1,25253 @@ +#loc = loc(unknown) +module attributes { + llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", + llvm.target_triple = "x86_64-unknown-linux-gnu", + "onnx-mlir.symbol-postfix" = "onnxmodel.onnx.mlir", + vaimlconf.device = "stx", + vaimlconf.device_models = "${vaimlconf.install_dir}/data/deviceModels", + vaimlconf.install_dir = "/usr/local/lib/python3.10/dist-packages/flexml/flexml_extras", + vaimlconf.library_metadata = ["${vaimlconf.install_dir}/data/libraryMetadata/L1", "${vaimlconf.install_dir}/data/libraryMetadata/L2", "${vaimlconf.install_dir}/../../vitis_mllib/L1/metadata", "${vaimlconf.install_dir}/../../vitis_mllib/L2/metadata", "${vaimlconf.install_dir}/share/microkernel-tiling/tiling-recipe-specs"], + vaimlconf.single_core_compiler = "chess"} { + func.func private @forward(%arg0: tensor<1x180x320x4xbf16> loc(unknown), %arg1: tensor<1x16x90x160xbf16> loc(unknown), %arg2: tensor<1x20x45x80xbf16> loc(unknown), %arg3: tensor<1x40x23x40xbf16> loc(unknown), %arg4: tensor<1x64x12x20xbf16> loc(unknown)) -> (tensor<1x16x90x160xbf16>, tensor<1x20x45x80xbf16>, tensor<1x40x23x40xbf16>, tensor<1x64x12x20xbf16>, tensor<1x3x180x320xbf16>, tensor<1x1x180x320xbf16>) attributes { + max_heap_size = 2240 : ui32, + max_stack_size = 2368 : ui32, + stack_heap_start_address = 45696 : ui32, + total_stack_heap_region_size = 6912 : ui32} { + %0 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_443/biases"} -> tensor<4xbf16> loc(#loc) + %1 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_443/weights"} -> tensor<4x16x1x1xbf16> loc(#loc) + %2 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_441/biases"} -> tensor<16xbf16> loc(#loc) + %3 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_441/weights"} -> tensor<16x16x3x3xbf16> loc(#loc) + %4 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_439/biases"} -> tensor<16xbf16> loc(#loc) + %5 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_439/weights"} -> tensor<16x35x3x3xbf16> loc(#loc) + %6 = xten_nn.load_external_const {file = "constants.h5", key = "Sub_431/Constant_0_0"} -> tensor<1x16x90x160xbf16> loc(#loc2) + %7 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_428/biases"} -> tensor<16xbf16> loc(#loc) + %8 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_428/weights"} -> tensor<16x32x3x3xbf16> loc(#loc) + %9 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_423/biases"} -> tensor<32xbf16> loc(#loc) + %10 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_423/weights"} -> tensor<32x32x3x3xbf16> loc(#loc) + %11 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_419/biases"} -> tensor<32xbf16> loc(#loc) + %12 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_419/weights"} -> tensor<32x59x3x3xbf16> loc(#loc) + %13 = xten_nn.load_external_const {file = "constants.h5", key = "Sub_411/Constant_0_0"} -> tensor<1x20x45x80xbf16> loc(#loc3) + %14 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_408/biases"} -> tensor<20xbf16> loc(#loc) + %15 = xten_nn.load_external_const {file = "constants.h5", key = 
"Conv_408/weights"} -> tensor<20x40x3x3xbf16> loc(#loc) + %16 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_403/biases"} -> tensor<40xbf16> loc(#loc) + %17 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_403/weights"} -> tensor<40x40x3x3xbf16> loc(#loc) + %18 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_399/biases"} -> tensor<40xbf16> loc(#loc) + %19 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_399/weights"} -> tensor<40x107x3x3xbf16> loc(#loc) + %20 = xten_nn.load_external_const {file = "constants.h5", key = "Sub_385/Constant_0_0"} -> tensor<1x40x23x40xbf16> loc(#loc4) + %21 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_382/biases"} -> tensor<40xbf16> loc(#loc) + %22 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_382/weights"} -> tensor<40x80x3x3xbf16> loc(#loc) + %23 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_377/biases"} -> tensor<80xbf16> loc(#loc) + %24 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_377/weights"} -> tensor<80x80x3x3xbf16> loc(#loc) + %25 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_373/biases"} -> tensor<80xbf16> loc(#loc) + %26 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_373/weights"} -> tensor<80x171x3x3xbf16> loc(#loc) + %27 = xten_nn.load_external_const {file = "constants.h5", key = "Sub_359/Constant_0_0"} -> tensor<1x64x12x20xbf16> loc(#loc5) + %28 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_356/biases"} -> tensor<64xbf16> loc(#loc) + %29 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_356/weights"} -> tensor<64x128x3x3xbf16> loc(#loc) + %30 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_351/biases"} -> tensor<128xbf16> loc(#loc) + %31 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_351/weights"} -> tensor<128x128x3x3xbf16> loc(#loc) + %32 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_340/biases"} -> tensor<128xbf16> loc(#loc) + %33 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_340/weights"} -> tensor<128x960x1x1xbf16> loc(#loc) + %34 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_343/biases"} -> tensor<128xbf16> loc(#loc) + %35 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_343/weights"} -> tensor<128x960x1x1xbf16> loc(#loc) + %36 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_331/biases"} -> tensor<960xbf16> loc(#loc) + %37 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_331/weights"} -> tensor<960x160x1x1xbf16> loc(#loc) + %38 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_329/biases"} -> tensor<160xbf16> loc(#loc) + %39 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_329/weights"} -> tensor<160x960x1x1xbf16> loc(#loc) + %40 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_320/biases"} -> tensor<960xbf16> loc(#loc) + %41 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_320/weights"} -> tensor<960x240x1x1xbf16> loc(#loc) + %42 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_318/biases"} -> tensor<240xbf16> loc(#loc) + %43 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_318/weights"} -> tensor<240x960x1x1xbf16> loc(#loc) + %44 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_308/biases"} -> tensor<960xbf16> loc(#loc) + %45 = xten_nn.load_external_const 
{file = "constants.h5", key = "Conv_308/weights"} -> tensor<960x1x9x9xbf16> loc(#loc) + %46 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_299/biases"} -> tensor<960xbf16> loc(#loc) + %47 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_299/weights"} -> tensor<960x160x1x1xbf16> loc(#loc) + %48 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_297/biases"} -> tensor<160xbf16> loc(#loc) + %49 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_297/weights"} -> tensor<160x960x1x1xbf16> loc(#loc) + %50 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_288/biases"} -> tensor<960xbf16> loc(#loc) + %51 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_288/weights"} -> tensor<960x240x1x1xbf16> loc(#loc) + %52 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_286/biases"} -> tensor<240xbf16> loc(#loc) + %53 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_286/weights"} -> tensor<240x960x1x1xbf16> loc(#loc) + %54 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_276/biases"} -> tensor<960xbf16> loc(#loc) + %55 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_276/weights"} -> tensor<960x1x9x9xbf16> loc(#loc) + %56 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_267/biases"} -> tensor<960xbf16> loc(#loc) + %57 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_267/weights"} -> tensor<960x160x1x1xbf16> loc(#loc) + %58 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_266/biases"} -> tensor<160xbf16> loc(#loc) + %59 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_266/weights"} -> tensor<160x672x1x1xbf16> loc(#loc) + %60 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_257/biases"} -> tensor<672xbf16> loc(#loc) + %61 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_257/weights"} -> tensor<672x168x1x1xbf16> loc(#loc) + %62 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_255/biases"} -> tensor<168xbf16> loc(#loc) + %63 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_255/weights"} -> tensor<168x672x1x1xbf16> loc(#loc) + %64 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_245/biases"} -> tensor<672xbf16> loc(#loc) + %65 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_245/weights"} -> tensor<672x1x9x9xbf16> loc(#loc) + %66 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_236/biases"} -> tensor<672xbf16> loc(#loc) + %67 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_236/weights"} -> tensor<672x112x1x1xbf16> loc(#loc) + %68 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_234/biases"} -> tensor<112xbf16> loc(#loc) + %69 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_234/weights"} -> tensor<112x672x1x1xbf16> loc(#loc) + %70 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_225/biases"} -> tensor<672xbf16> loc(#loc) + %71 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_225/weights"} -> tensor<672x168x1x1xbf16> loc(#loc) + %72 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_223/biases"} -> tensor<168xbf16> loc(#loc) + %73 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_223/weights"} -> tensor<168x672x1x1xbf16> loc(#loc) + %74 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_213/biases"} -> tensor<672xbf16> loc(#loc) + %75 = 
xten_nn.load_external_const {file = "constants.h5", key = "Conv_213/weights"} -> tensor<672x1x3x3xbf16> loc(#loc) + %76 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_204/biases"} -> tensor<672xbf16> loc(#loc) + %77 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_204/weights"} -> tensor<672x112x1x1xbf16> loc(#loc) + %78 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_203/biases"} -> tensor<112xbf16> loc(#loc) + %79 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_203/weights"} -> tensor<112x480x1x1xbf16> loc(#loc) + %80 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_194/biases"} -> tensor<480xbf16> loc(#loc) + %81 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_194/weights"} -> tensor<480x120x1x1xbf16> loc(#loc) + %82 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_192/biases"} -> tensor<120xbf16> loc(#loc) + %83 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_192/weights"} -> tensor<120x480x1x1xbf16> loc(#loc) + %84 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_182/biases"} -> tensor<480xbf16> loc(#loc) + %85 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_182/weights"} -> tensor<480x1x3x3xbf16> loc(#loc) + %86 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_173/biases"} -> tensor<480xbf16> loc(#loc) + %87 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_173/weights"} -> tensor<480x80x1x1xbf16> loc(#loc) + %88 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_171/biases"} -> tensor<80xbf16> loc(#loc) + %89 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_171/weights"} -> tensor<80x184x1x1xbf16> loc(#loc) + %90 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_162/biases"} -> tensor<184xbf16> loc(#loc) + %91 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_162/weights"} -> tensor<184x1x3x3xbf16> loc(#loc) + %92 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_153/biases"} -> tensor<184xbf16> loc(#loc) + %93 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_153/weights"} -> tensor<184x80x1x1xbf16> loc(#loc) + %94 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_151/biases"} -> tensor<80xbf16> loc(#loc) + %95 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_151/weights"} -> tensor<80x184x1x1xbf16> loc(#loc) + %96 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_142/biases"} -> tensor<184xbf16> loc(#loc) + %97 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_142/weights"} -> tensor<184x1x3x3xbf16> loc(#loc) + %98 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_133/biases"} -> tensor<184xbf16> loc(#loc) + %99 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_133/weights"} -> tensor<184x80x1x1xbf16> loc(#loc) + %100 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_131/biases"} -> tensor<80xbf16> loc(#loc) + %101 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_131/weights"} -> tensor<80x200x1x1xbf16> loc(#loc) + %102 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_122/biases"} -> tensor<200xbf16> loc(#loc) + %103 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_122/weights"} -> tensor<200x1x3x3xbf16> loc(#loc) + %104 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_113/biases"} -> tensor<200xbf16> 
loc(#loc) + %105 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_113/weights"} -> tensor<200x80x1x1xbf16> loc(#loc) + %106 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_112/biases"} -> tensor<80xbf16> loc(#loc) + %107 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_112/weights"} -> tensor<80x240x1x1xbf16> loc(#loc) + %108 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_103/biases"} -> tensor<240xbf16> loc(#loc) + %109 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_103/weights"} -> tensor<240x1x3x3xbf16> loc(#loc) + %110 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_94/biases"} -> tensor<240xbf16> loc(#loc) + %111 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_94/weights"} -> tensor<240x40x1x1xbf16> loc(#loc) + %112 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_92/biases"} -> tensor<40xbf16> loc(#loc) + %113 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_92/weights"} -> tensor<40x120x1x1xbf16> loc(#loc) + %114 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_83/biases"} -> tensor<120xbf16> loc(#loc) + %115 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_83/weights"} -> tensor<120x32x1x1xbf16> loc(#loc) + %116 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_81/biases"} -> tensor<32xbf16> loc(#loc) + %117 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_81/weights"} -> tensor<32x120x1x1xbf16> loc(#loc) + %118 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_78/biases"} -> tensor<120xbf16> loc(#loc) + %119 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_78/weights"} -> tensor<120x1x5x5xbf16> loc(#loc) + %120 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_76/biases"} -> tensor<120xbf16> loc(#loc) + %121 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_76/weights"} -> tensor<120x40x1x1xbf16> loc(#loc) + %122 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_74/biases"} -> tensor<40xbf16> loc(#loc) + %123 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_74/weights"} -> tensor<40x120x1x1xbf16> loc(#loc) + %124 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_65/biases"} -> tensor<120xbf16> loc(#loc) + %125 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_65/weights"} -> tensor<120x32x1x1xbf16> loc(#loc) + %126 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_63/biases"} -> tensor<32xbf16> loc(#loc) + %127 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_63/weights"} -> tensor<32x120x1x1xbf16> loc(#loc) + %128 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_60/biases"} -> tensor<120xbf16> loc(#loc) + %129 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_60/weights"} -> tensor<120x1x5x5xbf16> loc(#loc) + %130 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_58/biases"} -> tensor<120xbf16> loc(#loc) + %131 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_58/weights"} -> tensor<120x40x1x1xbf16> loc(#loc) + %132 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_57/biases"} -> tensor<40xbf16> loc(#loc) + %133 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_57/weights"} -> tensor<40x72x1x1xbf16> loc(#loc) + %134 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_48/biases"} -> 
tensor<72xbf16> loc(#loc) + %135 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_48/weights"} -> tensor<72x24x1x1xbf16> loc(#loc) + %136 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_46/biases"} -> tensor<24xbf16> loc(#loc) + %137 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_46/weights"} -> tensor<24x72x1x1xbf16> loc(#loc) + %138 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_43/biases"} -> tensor<72xbf16> loc(#loc) + %139 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_43/weights"} -> tensor<72x1x5x5xbf16> loc(#loc) + %140 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_41/biases"} -> tensor<72xbf16> loc(#loc) + %141 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_41/weights"} -> tensor<72x24x1x1xbf16> loc(#loc) + %142 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_39/biases"} -> tensor<24xbf16> loc(#loc) + %143 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_39/weights"} -> tensor<24x72x1x1xbf16> loc(#loc) + %144 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_37/biases"} -> tensor<72xbf16> loc(#loc) + %145 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_37/weights"} -> tensor<72x1x3x3xbf16> loc(#loc) + %146 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_35/biases"} -> tensor<72xbf16> loc(#loc) + %147 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_35/weights"} -> tensor<72x24x1x1xbf16> loc(#loc) + %148 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_34/biases"} -> tensor<24xbf16> loc(#loc) + %149 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_34/weights"} -> tensor<24x64x1x1xbf16> loc(#loc) + %150 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_32/biases"} -> tensor<64xbf16> loc(#loc) + %151 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_32/weights"} -> tensor<64x1x3x3xbf16> loc(#loc) + %152 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_30/biases"} -> tensor<64xbf16> loc(#loc) + %153 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_30/weights"} -> tensor<64x16x1x1xbf16> loc(#loc) + %154 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_28/biases"} -> tensor<16xbf16> loc(#loc) + %155 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_28/weights"} -> tensor<16x16x1x1xbf16> loc(#loc) + %156 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_26/biases"} -> tensor<16xbf16> loc(#loc) + %157 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_26/weights"} -> tensor<16x1x3x3xbf16> loc(#loc) + %158 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_17/biases"} -> tensor<16xbf16> loc(#loc) + %159 = xten_nn.load_external_const {file = "constants.h5", key = "Conv_17/weights"} -> tensor<16x3x3x3xbf16> loc(#loc) + %160 = xten_nn.load_external_const {file = "constants.h5", key = "Div_16/Constant_1_0"} -> tensor<1x3x180x320xbf16> loc(#loc6) + %161 = xten_nn.load_external_const {file = "constants.h5", key = "Sub_14/Constant_1_0"} -> tensor<1x3x180x320xbf16> loc(#loc309) + %162 = xten_nn.subgraph (%arg5 = %arg0: tensor<1x180x320x4xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Div_2", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<[0, 4, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 
180, 320, 4]> : vector<4xindex> + } + ], + OutputName = "Div_2", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<[0, 4, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 180, 320, 4]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x180x320x4xbf16>) attributes { + LayerName = "Div_2", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<[0, 4, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 180, 320, 4]> : vector<4xindex> + } + ], + OutputName = "Div_2", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<[0, 4, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 180, 320, 4]> : vector<4xindex> + } + ], + Specializes = "MulAttributeBroadcastingBf16", + Traits = { + Elementwise = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.num_kernel_iters = 0 : ui16, + config.scalar = 3.906250e-03 : bf16, + config.scalar_position = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<3.906250e-03> : tensor<1x1x1x1xbf16>}> : () -> tensor<1x1x1x1xbf16> loc(#loc1) + %463 = tosa.mul %arg6, %462 { + LayerName = "Div_2", + OutputName = "Div_2", + shift = 0 : i8} : (tensor<1x180x320x4xbf16>, tensor<1x1x1x1xbf16>) -> tensor<1x180x320x4xbf16> loc(#loc1) + xten_nn.output %463 : tensor<1x180x320x4xbf16> loc(#loc1) + } -> tensor<1x180x320x4xbf16> loc(#loc1) + xten_nn.output %461 : tensor<1x180x320x4xbf16> loc(#loc1) + } -> tensor<1x180x320x4xbf16> loc(#loc1) + %163 = xten_nn.subgraph (%arg5 = %162: tensor<1x180x320x4xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Slice_7", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<[0, 4, 0, 0]> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 180, 320, 4]> : vector<4xindex> + } + ], + OutputName = "Slice_7", + Overlay = "1x1_1x1_unspecifiedConnectivity", + Reason = "TemplatedGraph", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<[0, 4, 0, 0]> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 180, 320, 3]> : vector<4xindex> + } + ], + Specializes = "SliceHCWC8Adf", + With = { + config.aie_arch = "aie2p", + config.axis_letter = "W", + config.dim_c = 184 : ui32, + config.dim_h = 320 : ui32, + config.dim_w = 4 : ui32, + config.dtype = "bfloat16", + config.end = 3 : ui32, + config.num_ifm_shim_ch = 2 : ui32, + config.num_ofm_shim_ch = 2 : ui32, + config.start = 0 : ui32, + config.step = 1 : ui32 + }} { + %461 = tosa.slice %arg5 { + LayerName = "Slice_7", + OutputName = "Slice_7", + size = array, + start = array} : (tensor<1x180x320x4xbf16>) -> tensor<1x180x320x3xbf16> loc(#loc9) + xten_nn.output %461 : tensor<1x180x320x3xbf16> loc(#loc9) + } -> tensor<1x180x320x3xbf16> loc(#loc9) + %164 = xten_nn.subgraph (%arg5 = %163: tensor<1x180x320x3xbf16>) attributes { + 
IfmOperands = [0 : index], + LayerName = "Generated-#0", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<[0, 4, 0, 0]> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 180, 320, 3]> : vector<4xindex> + } + ], + OutputName = "Generated-#1", + Overlay = "1x1_1x1_unspecifiedConnectivity", + Reason = "TemplatedGraph", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<[0, 4, 0, 5]> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 180, 320, 3]> : vector<4xindex> + } + ], + Specializes = "BufferPadAdf", + With = { + config.aie_arch = "aie2p", + config.dim_0 = 320 : ui32, + config.dim_0_padded = 320 : ui32, + config.dim_1 = 23 : ui32, + config.dim_1_padded = 23 : ui32, + config.dim_2 = 3 : ui32, + config.dim_2_padded = 8 : ui32, + config.dim_3 = 8 : ui32, + config.dim_3_padded = 8 : ui32, + config.dtype = "bfloat16" + }} { + xten_nn.output %arg5 : tensor<1x180x320x3xbf16> loc(#loc10) + } -> tensor<1x180x320x3xbf16> loc(#loc10) + %165 = xten_nn.subgraph (%arg5 = %164: tensor<1x180x320x3xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Generated-#2", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<[0, 4, 0, 5]> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 180, 320, 3]> : vector<4xindex> + } + ], + OutputName = "Generated-#3", + Overlay = "1x1_1x1_unspecifiedConnectivity", + Reason = "TemplatedGraph", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<[0, 5, 4, 0]> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 3, 180, 320]> : vector<4xindex> + } + ], + Specializes = "Transpose4dAdf", + With = { + config.aie_arch = "aie2p", + config.dim_0 = 320 : ui32, + config.dim_1 = 23 : ui32, + config.dim_2 = 8 : ui32, + config.dim_3 = 8 : ui32, + config.dtype = "bfloat16", + config.perm = 10 : ui32 + }} { + %461 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc11) + %462 = tosa.transpose %arg5, %461 : (tensor<1x180x320x3xbf16>, tensor<4xi32>) -> tensor<1x3x180x320xbf16> loc(#loc311) + xten_nn.output %462 : tensor<1x3x180x320xbf16> loc(#loc311) + } -> tensor<1x3x180x320xbf16> loc(#loc310) + %166 = xten_nn.subgraph (%arg5 = %165: tensor<1x3x180x320xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Generated-#4", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<[0, 5, 4, 0]> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 3, 180, 320]> : vector<4xindex> + } + ], + OutputName = "Generated-#5", + Overlay = "1x1_1x1_unspecifiedConnectivity", + Reason = "TemplatedGraph", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<[0, 5, 0, 0]> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 3, 180, 320]> : vector<4xindex> + } + ], + 
Specializes = "BufferUnpadAdf", + With = { + config.aie_arch = "aie2p", + config.dim_0 = 184 : ui32, + config.dim_0_unpadded = 180 : ui32, + config.dim_1 = 1 : ui32, + config.dim_1_unpadded = 1 : ui32, + config.dim_2 = 320 : ui32, + config.dim_2_unpadded = 320 : ui32, + config.dim_3 = 8 : ui32, + config.dim_3_unpadded = 8 : ui32, + config.dtype = "bfloat16" + }} { + xten_nn.output %arg5 : tensor<1x3x180x320xbf16> loc(#loc10) + } -> tensor<1x3x180x320xbf16> loc(#loc10) + %167 = xten_nn.subgraph (%arg5 = %166: tensor<1x3x180x320xbf16>, %arg6 = %161: tensor<1x3x180x320xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Sub_14", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<[0, 5, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 3, 180, 320]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<[0, 5, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 3, 180, 320]> : vector<4xindex> + } + ], + OutputName = "Initializer_398", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<[0, 5, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 3, 180, 320]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "double", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg7 = %arg5: tensor<1x3x180x320xbf16>, %arg8 = %arg6: tensor<1x3x180x320xbf16>) attributes { + LayerName = "Sub_14", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<[0, 5, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 3, 180, 320]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<[0, 5, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 3, 180, 320]> : vector<4xindex> + } + ], + OutputName = "Initializer_398", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<[0, 5, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 3, 180, 320]> : vector<4xindex> + } + ], + Specializes = "AddBf16", + Traits = { + Binary = true, + Elementwise = true + }, + With = { + config.act = 0 : ui8, + config.act_type = "LINEAR", + config.aie_arch = "aie2p", + config.compiler = "chess", + config.dtype = "bfloat16", + config.num_kernel_iters = 0 : ui16 + }} { + %462 = tosa.add %arg7, %arg8 {LayerName = "Sub_14", OutputName = "Initializer_398"} : (tensor<1x3x180x320xbf16>, tensor<1x3x180x320xbf16>) -> tensor<1x3x180x320xbf16> loc(#loc309) + xten_nn.output %462 : tensor<1x3x180x320xbf16> loc(#loc309) + } -> tensor<1x3x180x320xbf16> loc(#loc309) + xten_nn.output %461 : tensor<1x3x180x320xbf16> loc(#loc309) + } -> tensor<1x3x180x320xbf16> loc(#loc309) + %168 = xten_nn.subgraph (%arg5 = %167: tensor<1x3x180x320xbf16>, %arg6 = %160: tensor<1x3x180x320xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Div_16", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<[0, 5, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 3, 180, 320]> : 
vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<[0, 5, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 3, 180, 320]> : vector<4xindex> + } + ], + OutputName = "Div_16", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<[0, 5, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 3, 180, 320]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "double", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg7 = %arg5: tensor<1x3x180x320xbf16>, %arg8 = %arg6: tensor<1x3x180x320xbf16>) attributes { + LayerName = "Div_16", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<[0, 5, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 3, 180, 320]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<[0, 5, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 3, 180, 320]> : vector<4xindex> + } + ], + OutputName = "Div_16", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<[0, 5, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 3, 180, 320]> : vector<4xindex> + } + ], + Specializes = "MulBf16", + Traits = { + Binary = true, + Elementwise = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.dtype = "bfloat16", + config.num_kernel_iters = 0 : ui16 + }} { + %462 = tosa.mul %arg7, %arg8 { + OutputName = "Div_16", + PartOfLayerName = "Div_16", + shift = 0 : i8} : (tensor<1x3x180x320xbf16>, tensor<1x3x180x320xbf16>) -> tensor<1x3x180x320xbf16> loc(#loc6) + xten_nn.output %462 : tensor<1x3x180x320xbf16> loc(#loc6) + } -> tensor<1x3x180x320xbf16> loc(#loc6) + xten_nn.output %461 : tensor<1x3x180x320xbf16> loc(#loc6) + } -> tensor<1x3x180x320xbf16> loc(#loc6) + %169 = xten_nn.subgraph (%arg5 = %168: tensor<1x3x180x320xbf16>, %arg6 = %159: tensor<16x3x3x3xbf16>, %arg7 = %158: tensor<16xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Conv_17", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<[0, 5, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 3, 180, 320]> : vector<4xindex> + }, + { + UnknownDataFormat = true, + l3_extend_end = dense<[0, 5, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[16, 3, 3, 3]> : vector<4xindex> + }, + { + UnknownDataFormat = true + } + ], + OutputName = "Conv_17", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "double", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg8 = %arg5: tensor<1x3x180x320xbf16>, %arg9 = %arg6: tensor<16x3x3x3xbf16>, %arg10 = %arg7: tensor<16xbf16>) attributes { + Dilations = array, + HWPadding = [[1, 0], [1, 0]], + LayerName = 
"Conv_17", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<[0, 5, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 3, 180, 320]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "wts_data", + UnknownDataFormat = true, + l3_extend_end = dense<[0, 5, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[16, 3, 3, 3]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Conv_17", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + } + ], + Specializes = "Conv2DBf16", + Traits = { + AllowDMAOptimization = true + }, + With = { + config.AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 = 1 : ui8, + config.act = 0 : ui8, + config.act_type = "RELU", + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.conv_type = [0 : ui8, 12 : ui8, 64 : ui8], + config.dtype_ifm = "bfloat16", + config.dtype_ofm = "bfloat16", + config.dtype_wts = "bfloat16", + config.ksize.height = 3 : ui8, + config.ksize.width = 3 : ui8, + config.lrelu_alpha = 1.000000e+00 : bf16, + config.lrelu_alpha_kernel = 1.000000e+00 : bf16, + config.stride_h = 2 : ui8, + config.stride_w = 2 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %463 = "tosa.const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %464 = tosa.transpose %arg9, %463 : (tensor<16x3x3x3xbf16>, tensor<4xi32>) -> tensor<16x3x3x3xbf16> loc(#loc13) + %465 = tosa.transpose %arg8, %463 : (tensor<1x3x180x320xbf16>, tensor<4xi32>) -> tensor<1x180x320x3xbf16> loc(#loc13) + %466 = tosa.conv2d %465, %464, %arg10 { + PartOfLayerName = "Conv_17", + PartOfOutputName = "Conv_17", + dilation = array, + pad = array, + stride = array} : (tensor<1x180x320x3xbf16>, tensor<16x3x3x3xbf16>, tensor<16xbf16>) -> tensor<1x90x160x16xbf16> loc(#loc13) + %467 = tosa.transpose %466, %462 : (tensor<1x90x160x16xbf16>, tensor<4xi32>) -> tensor<1x16x90x160xbf16> loc(#loc13) + xten_nn.output %467 : tensor<1x16x90x160xbf16> loc(#loc13) + } -> tensor<1x16x90x160xbf16> loc(#loc13) + xten_nn.output %461 : tensor<1x16x90x160xbf16> loc(#loc13) + } -> tensor<1x16x90x160xbf16> loc(#loc13) + %170 = xten_nn.subgraph (%arg5 = %169: tensor<1x16x90x160xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Add_19", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + } + ], + OutputName = "Add_19", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x16x90x160xbf16>) attributes { + LayerName = "Add_19", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = 
"data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + } + ], + OutputName = "Add_19", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + } + ], + Specializes = "AddAttributeBroadcastingBf16", + Traits = { + Elementwise = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.num_kernel_iters = 0 : ui16, + config.scalar = 3.000000e+00 : bf16, + config.scalar_position = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<3.000000e+00> : tensor<1x1x1x1xbf16>}> : () -> tensor<1x1x1x1xbf16> loc(#loc) + %463 = tosa.add %arg6, %462 {LayerName = "Add_19", OutputName = "Add_19"} : (tensor<1x16x90x160xbf16>, tensor<1x1x1x1xbf16>) -> tensor<1x16x90x160xbf16> loc(#loc14) + xten_nn.output %463 : tensor<1x16x90x160xbf16> loc(#loc14) + } -> tensor<1x16x90x160xbf16> loc(#loc14) + xten_nn.output %461 : tensor<1x16x90x160xbf16> loc(#loc14) + } -> tensor<1x16x90x160xbf16> loc(#loc14) + %171 = xten_nn.subgraph (%arg5 = %170: tensor<1x16x90x160xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Clip_22", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + } + ], + OutputName = "Clip_22", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x16x90x160xbf16>) attributes { + LayerName = "Clip_22", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + } + ], + OutputName = "Clip_22", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + } + ], + Specializes = "ClipBf16", + Traits = { + Elementwise = true, + NonNegativeOut = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.clamp_max = 6.000000e+00 : bf16, + config.clamp_min = 0.000000e+00 : bf16, + config.compiler = "chess", + config.ifm_shift = 0 : si8, + config.num_kernel_iters = 0 : ui16, + config.ofm_shift = 0 : si8 + }} { + %462 = tosa.clamp %arg6 { + LayerName = "Clip_22", + OutputName = "Clip_22", + max_fp = 6.000000e+00 : f32, + max_int = 6 : i64, + min_fp = 0.000000e+00 : f32, + min_int = 0 : i64} : (tensor<1x16x90x160xbf16>) -> tensor<1x16x90x160xbf16> loc(#loc15) + xten_nn.output %462 : tensor<1x16x90x160xbf16> loc(#loc15) + } -> tensor<1x16x90x160xbf16> loc(#loc15) + xten_nn.output %461 : tensor<1x16x90x160xbf16> loc(#loc15) + } -> tensor<1x16x90x160xbf16> loc(#loc15) + %172 = xten_nn.subgraph (%arg5 = %171: tensor<1x16x90x160xbf16>) attributes { 
+ IfmOperands = [0 : index], + LayerName = "Div_24", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + } + ], + OutputName = "Div_24", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x16x90x160xbf16>) attributes { + LayerName = "Div_24", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + } + ], + OutputName = "Div_24", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + } + ], + Specializes = "MulAttributeBroadcastingBf16", + Traits = { + Elementwise = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.num_kernel_iters = 0 : ui16, + config.scalar = 1.660160e-01 : bf16, + config.scalar_position = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<1.660160e-01> : tensor<1x1x1x1xbf16>}> : () -> tensor<1x1x1x1xbf16> loc(#loc) + %463 = tosa.mul %arg6, %462 { + LayerName = "Div_24", + OutputName = "Div_24", + shift = 0 : i8} : (tensor<1x16x90x160xbf16>, tensor<1x1x1x1xbf16>) -> tensor<1x16x90x160xbf16> loc(#loc16) + xten_nn.output %463 : tensor<1x16x90x160xbf16> loc(#loc16) + } -> tensor<1x16x90x160xbf16> loc(#loc16) + xten_nn.output %461 : tensor<1x16x90x160xbf16> loc(#loc16) + } -> tensor<1x16x90x160xbf16> loc(#loc16) + %173 = xten_nn.subgraph (%arg5 = %169: tensor<1x16x90x160xbf16>, %arg6 = %172: tensor<1x16x90x160xbf16>) attributes { + IfmOperands = [0 : index, 1 : index], + LayerName = "Mul_25", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + } + ], + OutputName = "Mul_25", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "double", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg7 = %arg5: tensor<1x16x90x160xbf16>, %arg8 = %arg6: tensor<1x16x90x160xbf16>) attributes { + LayerName = "Mul_25", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<0> : vector<4xindex>, 
+ l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + } + ], + OutputName = "Mul_25", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + } + ], + Specializes = "MulBf16", + Traits = { + Binary = true, + Elementwise = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.dtype = "bfloat16", + config.num_kernel_iters = 0 : ui16 + }} { + %462 = tosa.mul %arg7, %arg8 { + LayerName = "Mul_25", + OutputName = "Mul_25", + shift = 0 : i8} : (tensor<1x16x90x160xbf16>, tensor<1x16x90x160xbf16>) -> tensor<1x16x90x160xbf16> loc(#loc17) + xten_nn.output %462 : tensor<1x16x90x160xbf16> loc(#loc17) + } -> tensor<1x16x90x160xbf16> loc(#loc17) + xten_nn.output %461 : tensor<1x16x90x160xbf16> loc(#loc17) + } -> tensor<1x16x90x160xbf16> loc(#loc17) + %174 = xten_nn.subgraph (%arg5 = %173: tensor<1x16x90x160xbf16>, %arg6 = %157: tensor<16x1x3x3xbf16>, %arg7 = %156: tensor<16xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Conv_26", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + }, + { + CurrentDataFormat = "CMHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[16, 1, 3, 3]> : vector<4xindex> + }, + { + UnknownDataFormat = true + } + ], + OutputName = "Relu_27", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg8 = %arg5: tensor<1x16x90x160xbf16>, %arg9 = %arg6: tensor<16x1x3x3xbf16>, %arg10 = %arg7: tensor<16xbf16>) attributes { + Dilations = array, + HWPadding = [[1, 1], [1, 1]], + LayerName = "Conv_26", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + }, + { + CurrentDataFormat = "CMHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.wts", + SubPort = "wts_data", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[16, 1, 3, 3]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Relu_27", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + } + ], + Specializes = "DepthwiseConv2dBf16", + Traits = { + NonNegativeOut = true + }, + With = { + config.act = 1 : ui8, + config.aie_arch = 
"aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.kernel_height = 3 : ui8, + config.kernel_width = 3 : ui8, + config.stride = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %463 = "tosa.const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %464 = "tosa.const"() <{value = dense<[2, 3, 0, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc312) + %465 = tosa.transpose %arg9, %464 : (tensor<16x1x3x3xbf16>, tensor<4xi32>) -> tensor<3x3x16x1xbf16> loc(#loc312) + %466 = tosa.transpose %arg8, %463 : (tensor<1x16x90x160xbf16>, tensor<4xi32>) -> tensor<1x90x160x16xbf16> loc(#loc312) + %467 = tosa.depthwise_conv2d %466, %465, %arg10 { + PartOfLayerName = "Conv_26", + PartOfOutputName = "Conv_26", + dilation = array, + pad = array, + stride = array} : (tensor<1x90x160x16xbf16>, tensor<3x3x16x1xbf16>, tensor<16xbf16>) -> tensor<1x90x160x16xbf16> loc(#loc18) + %468 = tosa.clamp %467 { + LayerName = "Relu_27", + OutputName = "Relu_27", + max_fp = 3.40282347E+38 : f32, + max_int = 2147483647 : i64, + min_fp = 0.000000e+00 : f32, + min_int = 0 : i64} : (tensor<1x90x160x16xbf16>) -> tensor<1x90x160x16xbf16> loc(#loc19) + %469 = tosa.transpose %468, %462 : (tensor<1x90x160x16xbf16>, tensor<4xi32>) -> tensor<1x16x90x160xbf16> loc(#loc312) + xten_nn.output %469 : tensor<1x16x90x160xbf16> loc(#loc19) + } -> tensor<1x16x90x160xbf16> loc(#loc312) + xten_nn.output %461 : tensor<1x16x90x160xbf16> loc(#loc312) + } -> tensor<1x16x90x160xbf16> loc(#loc312) + %175 = xten_nn.subgraph (%arg5 = %174: tensor<1x16x90x160xbf16>, %arg6 = %155: tensor<16x16x1x1xbf16>, %arg7 = %154: tensor<16xbf16>, %arg8 = %173: tensor<1x16x90x160xbf16>) attributes { + IfmOperands = [0 : index, 3 : index], + LayerName = "Conv_28", + OfmShare = 3 : index, + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + }, + { + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[16, 16, 1, 1]> : vector<4xindex> + }, + { + UnknownDataFormat = true + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + } + ], + OutputName = "Add_29", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg9 = %arg5: tensor<1x16x90x160xbf16>, %arg10 = %arg6: tensor<16x16x1x1xbf16>, %arg11 = %arg7: tensor<16xbf16>) attributes { + Dilations = array, + HWPadding = [[0, 0], [0, 0]], + LayerName = "Conv_28", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "wts_data", + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[16, 16, 1, 1]> : 
vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Conv_28", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + } + ], + Specializes = "Conv2DBf16", + Traits = { + AllowDMAOptimization = true + }, + With = { + config.AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 = 1 : ui8, + config.act = 0 : ui8, + config.act_type = "RELU", + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.conv_type = [0 : ui8, 12 : ui8, 64 : ui8], + config.dtype_ifm = "bfloat16", + config.dtype_ofm = "bfloat16", + config.dtype_wts = "bfloat16", + config.ksize.height = 1 : ui8, + config.ksize.width = 1 : ui8, + config.lrelu_alpha = 1.000000e+00 : bf16, + config.lrelu_alpha_kernel = 1.000000e+00 : bf16, + config.stride_h = 1 : ui8, + config.stride_w = 1 : ui8 + }} { + %463 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %464 = "tosa.const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc20) + %465 = tosa.reshape %arg10 {new_shape = array} : (tensor<16x16x1x1xbf16>) -> tensor<16x1x1x16xbf16> loc(#loc20) + %466 = tosa.transpose %arg9, %464 : (tensor<1x16x90x160xbf16>, tensor<4xi32>) -> tensor<1x90x160x16xbf16> loc(#loc20) + %467 = tosa.conv2d %466, %465, %arg11 { + PartOfLayerName = "Conv_28", + PartOfOutputName = "Conv_28", + dilation = array, + pad = array, + stride = array} : (tensor<1x90x160x16xbf16>, tensor<16x1x1x16xbf16>, tensor<16xbf16>) -> tensor<1x90x160x16xbf16> loc(#loc20) + %468 = tosa.transpose %467, %463 : (tensor<1x90x160x16xbf16>, tensor<4xi32>) -> tensor<1x16x90x160xbf16> loc(#loc20) + xten_nn.output %468 : tensor<1x16x90x160xbf16> loc(#loc20) + } -> tensor<1x16x90x160xbf16> loc(#loc20) + %462 = xten_nn.subgraph (%arg9 = %461: tensor<1x16x90x160xbf16>, %arg10 = %arg8: tensor<1x16x90x160xbf16>) attributes { + LayerName = "Add_29", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + } + ], + OutputName = "Add_29", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + } + ], + Specializes = "AddBf16", + Traits = { + Binary = true, + Elementwise = true + }, + With = { + config.act = 0 : ui8, + config.act_type = "LINEAR", + config.aie_arch = "aie2p", + config.compiler = "chess", + config.dtype = "bfloat16", + config.num_kernel_iters = 0 : ui16 + }} { + %463 = tosa.add %arg9, %arg10 {LayerName = "Add_29", OutputName = "Add_29"} : (tensor<1x16x90x160xbf16>, tensor<1x16x90x160xbf16>) -> tensor<1x16x90x160xbf16> loc(#loc21) + xten_nn.output %463 : tensor<1x16x90x160xbf16> loc(#loc21) + } -> tensor<1x16x90x160xbf16> loc(#loc21) + xten_nn.output %462 : tensor<1x16x90x160xbf16> loc(#loc21) 
+ } -> tensor<1x16x90x160xbf16> loc(#loc313) + %176 = xten_nn.subgraph (%arg5 = %175: tensor<1x16x90x160xbf16>, %arg6 = %153: tensor<64x16x1x1xbf16>, %arg7 = %152: tensor<64xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Conv_30", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + }, + { + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[64, 16, 1, 1]> : vector<4xindex> + }, + { + UnknownDataFormat = true + } + ], + OutputName = "Relu_31", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 64, 90, 160]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "double", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg8 = %arg5: tensor<1x16x90x160xbf16>, %arg9 = %arg6: tensor<64x16x1x1xbf16>, %arg10 = %arg7: tensor<64xbf16>) attributes { + Dilations = array, + HWPadding = [[0, 0], [0, 0]], + LayerName = "Conv_30", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "wts_data", + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[64, 16, 1, 1]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Relu_31", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 64, 90, 160]> : vector<4xindex> + } + ], + Specializes = "Conv2DBf16", + Traits = { + AllowDMAOptimization = true, + NonNegativeOut = true + }, + With = { + config.AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 = 1 : ui8, + config.act = 1 : ui8, + config.act_type = "RELU", + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.conv_type = [0 : ui8, 12 : ui8, 64 : ui8], + config.dtype_ifm = "bfloat16", + config.dtype_ofm = "bfloat16", + config.dtype_wts = "bfloat16", + config.ksize.height = 1 : ui8, + config.ksize.width = 1 : ui8, + config.lrelu_alpha = 0.000000e+00 : bf16, + config.lrelu_alpha_kernel = 0.000000e+00 : bf16, + config.stride_h = 1 : ui8, + config.stride_w = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %463 = "tosa.const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc314) + %464 = tosa.reshape %arg9 {new_shape = array} : (tensor<64x16x1x1xbf16>) -> tensor<64x1x1x16xbf16> loc(#loc314) + %465 = tosa.transpose %arg8, %463 : (tensor<1x16x90x160xbf16>, tensor<4xi32>) -> tensor<1x90x160x16xbf16> loc(#loc314) + %466 = tosa.conv2d %465, %464, %arg10 { + PartOfLayerName = "Conv_30", + PartOfOutputName = "Conv_30", + dilation = array, + pad = array, + stride = array} : (tensor<1x90x160x16xbf16>, tensor<64x1x1x16xbf16>, tensor<64xbf16>) -> tensor<1x90x160x64xbf16> loc(#loc22) + %467 = tosa.clamp 
%466 { + LayerName = "Relu_31", + OutputName = "Relu_31", + max_fp = 3.40282347E+38 : f32, + max_int = 2147483647 : i64, + min_fp = 0.000000e+00 : f32, + min_int = 0 : i64} : (tensor<1x90x160x64xbf16>) -> tensor<1x90x160x64xbf16> loc(#loc23) + %468 = tosa.transpose %467, %462 : (tensor<1x90x160x64xbf16>, tensor<4xi32>) -> tensor<1x64x90x160xbf16> loc(#loc314) + xten_nn.output %468 : tensor<1x64x90x160xbf16> loc(#loc23) + } -> tensor<1x64x90x160xbf16> loc(#loc314) + xten_nn.output %461 : tensor<1x64x90x160xbf16> loc(#loc314) + } -> tensor<1x64x90x160xbf16> loc(#loc314) + %177 = xten_nn.subgraph (%arg5 = %176: tensor<1x64x90x160xbf16>, %arg6 = %151: tensor<64x1x3x3xbf16>, %arg7 = %150: tensor<64xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Conv_32", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 64, 90, 160]> : vector<4xindex> + }, + { + CurrentDataFormat = "CMHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[64, 1, 3, 3]> : vector<4xindex> + }, + { + UnknownDataFormat = true + } + ], + OutputName = "Relu_33", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 64, 45, 80]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "double", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg8 = %arg5: tensor<1x64x90x160xbf16>, %arg9 = %arg6: tensor<64x1x3x3xbf16>, %arg10 = %arg7: tensor<64xbf16>) attributes { + Dilations = array, + HWPadding = [[1, 0], [1, 0]], + LayerName = "Conv_32", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 64, 90, 160]> : vector<4xindex> + }, + { + CurrentDataFormat = "CMHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.wts", + SubPort = "wts_data", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[64, 1, 3, 3]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Relu_33", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 64, 45, 80]> : vector<4xindex> + } + ], + Specializes = "DepthwiseConv2dBf16", + Traits = { + NonNegativeOut = true + }, + With = { + config.act = 1 : ui8, + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.kernel_height = 3 : ui8, + config.kernel_width = 3 : ui8, + config.stride = 2 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %463 = "tosa.const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %464 = "tosa.const"() <{value = dense<[2, 3, 0, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc315) + %465 = tosa.transpose %arg9, %464 : (tensor<64x1x3x3xbf16>, tensor<4xi32>) -> tensor<3x3x64x1xbf16> loc(#loc315) + %466 = tosa.transpose %arg8, %463 : 
(tensor<1x64x90x160xbf16>, tensor<4xi32>) -> tensor<1x90x160x64xbf16> loc(#loc315) + %467 = tosa.depthwise_conv2d %466, %465, %arg10 { + PartOfLayerName = "Conv_32", + PartOfOutputName = "Conv_32", + dilation = array, + pad = array, + stride = array} : (tensor<1x90x160x64xbf16>, tensor<3x3x64x1xbf16>, tensor<64xbf16>) -> tensor<1x45x80x64xbf16> loc(#loc24) + %468 = tosa.clamp %467 { + LayerName = "Relu_33", + OutputName = "Relu_33", + max_fp = 3.40282347E+38 : f32, + max_int = 2147483647 : i64, + min_fp = 0.000000e+00 : f32, + min_int = 0 : i64} : (tensor<1x45x80x64xbf16>) -> tensor<1x45x80x64xbf16> loc(#loc25) + %469 = tosa.transpose %468, %462 : (tensor<1x45x80x64xbf16>, tensor<4xi32>) -> tensor<1x64x45x80xbf16> loc(#loc315) + xten_nn.output %469 : tensor<1x64x45x80xbf16> loc(#loc25) + } -> tensor<1x64x45x80xbf16> loc(#loc315) + xten_nn.output %461 : tensor<1x64x45x80xbf16> loc(#loc315) + } -> tensor<1x64x45x80xbf16> loc(#loc315) + %178 = xten_nn.subgraph (%arg5 = %177: tensor<1x64x45x80xbf16>, %arg6 = %149: tensor<24x64x1x1xbf16>, %arg7 = %148: tensor<24xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Conv_34", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 64, 45, 80]> : vector<4xindex> + }, + { + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[24, 64, 1, 1]> : vector<4xindex> + }, + { + UnknownDataFormat = true + } + ], + OutputName = "Conv_34", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 24, 45, 80]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg8 = %arg5: tensor<1x64x45x80xbf16>, %arg9 = %arg6: tensor<24x64x1x1xbf16>, %arg10 = %arg7: tensor<24xbf16>) attributes { + Dilations = array, + HWPadding = [[0, 0], [0, 0]], + LayerName = "Conv_34", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 64, 45, 80]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "wts_data", + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[24, 64, 1, 1]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Conv_34", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 24, 45, 80]> : vector<4xindex> + } + ], + Specializes = "Conv2DBf16", + Traits = { + AllowDMAOptimization = true + }, + With = { + config.AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 = 1 : ui8, + config.act = 0 : ui8, + config.act_type = "RELU", + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.conv_type = [0 : ui8, 12 : ui8, 64 : ui8], + config.dtype_ifm = "bfloat16", + config.dtype_ofm = "bfloat16", + config.dtype_wts = "bfloat16", + config.ksize.height = 1 : ui8, + config.ksize.width = 1 : ui8, + config.lrelu_alpha = 
1.000000e+00 : bf16, + config.lrelu_alpha_kernel = 1.000000e+00 : bf16, + config.stride_h = 1 : ui8, + config.stride_w = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %463 = "tosa.const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc26) + %464 = tosa.reshape %arg9 {new_shape = array} : (tensor<24x64x1x1xbf16>) -> tensor<24x1x1x64xbf16> loc(#loc26) + %465 = tosa.transpose %arg8, %463 : (tensor<1x64x45x80xbf16>, tensor<4xi32>) -> tensor<1x45x80x64xbf16> loc(#loc26) + %466 = tosa.conv2d %465, %464, %arg10 { + PartOfLayerName = "Conv_34", + PartOfOutputName = "Conv_34", + dilation = array, + pad = array, + stride = array} : (tensor<1x45x80x64xbf16>, tensor<24x1x1x64xbf16>, tensor<24xbf16>) -> tensor<1x45x80x24xbf16> loc(#loc26) + %467 = tosa.transpose %466, %462 : (tensor<1x45x80x24xbf16>, tensor<4xi32>) -> tensor<1x24x45x80xbf16> loc(#loc26) + xten_nn.output %467 : tensor<1x24x45x80xbf16> loc(#loc26) + } -> tensor<1x24x45x80xbf16> loc(#loc26) + xten_nn.output %461 : tensor<1x24x45x80xbf16> loc(#loc26) + } -> tensor<1x24x45x80xbf16> loc(#loc26) + %179 = xten_nn.subgraph (%arg5 = %178: tensor<1x24x45x80xbf16>, %arg6 = %147: tensor<72x24x1x1xbf16>, %arg7 = %146: tensor<72xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Conv_35", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 24, 45, 80]> : vector<4xindex> + }, + { + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[72, 24, 1, 1]> : vector<4xindex> + }, + { + UnknownDataFormat = true + } + ], + OutputName = "Relu_36", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 72, 45, 80]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg8 = %arg5: tensor<1x24x45x80xbf16>, %arg9 = %arg6: tensor<72x24x1x1xbf16>, %arg10 = %arg7: tensor<72xbf16>) attributes { + Dilations = array, + HWPadding = [[0, 0], [0, 0]], + LayerName = "Conv_35", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 24, 45, 80]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "wts_data", + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[72, 24, 1, 1]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Relu_36", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 72, 45, 80]> : vector<4xindex> + } + ], + Specializes = "Conv2DBf16", + Traits = { + AllowDMAOptimization = true, + NonNegativeOut = true + }, + With = { + config.AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 = 1 : ui8, + config.act = 1 : ui8, + config.act_type = "RELU", + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = 
"chess", + config.conv_type = [0 : ui8, 12 : ui8, 64 : ui8], + config.dtype_ifm = "bfloat16", + config.dtype_ofm = "bfloat16", + config.dtype_wts = "bfloat16", + config.ksize.height = 1 : ui8, + config.ksize.width = 1 : ui8, + config.lrelu_alpha = 0.000000e+00 : bf16, + config.lrelu_alpha_kernel = 0.000000e+00 : bf16, + config.stride_h = 1 : ui8, + config.stride_w = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %463 = "tosa.const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc316) + %464 = tosa.reshape %arg9 {new_shape = array} : (tensor<72x24x1x1xbf16>) -> tensor<72x1x1x24xbf16> loc(#loc316) + %465 = tosa.transpose %arg8, %463 : (tensor<1x24x45x80xbf16>, tensor<4xi32>) -> tensor<1x45x80x24xbf16> loc(#loc316) + %466 = tosa.conv2d %465, %464, %arg10 { + PartOfLayerName = "Conv_35", + PartOfOutputName = "Conv_35", + dilation = array, + pad = array, + stride = array} : (tensor<1x45x80x24xbf16>, tensor<72x1x1x24xbf16>, tensor<72xbf16>) -> tensor<1x45x80x72xbf16> loc(#loc27) + %467 = tosa.clamp %466 { + LayerName = "Relu_36", + OutputName = "Relu_36", + max_fp = 3.40282347E+38 : f32, + max_int = 2147483647 : i64, + min_fp = 0.000000e+00 : f32, + min_int = 0 : i64} : (tensor<1x45x80x72xbf16>) -> tensor<1x45x80x72xbf16> loc(#loc28) + %468 = tosa.transpose %467, %462 : (tensor<1x45x80x72xbf16>, tensor<4xi32>) -> tensor<1x72x45x80xbf16> loc(#loc316) + xten_nn.output %468 : tensor<1x72x45x80xbf16> loc(#loc28) + } -> tensor<1x72x45x80xbf16> loc(#loc316) + xten_nn.output %461 : tensor<1x72x45x80xbf16> loc(#loc316) + } -> tensor<1x72x45x80xbf16> loc(#loc316) + %180 = xten_nn.subgraph (%arg5 = %179: tensor<1x72x45x80xbf16>, %arg6 = %145: tensor<72x1x3x3xbf16>, %arg7 = %144: tensor<72xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Conv_37", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 72, 45, 80]> : vector<4xindex> + }, + { + CurrentDataFormat = "CMHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[72, 1, 3, 3]> : vector<4xindex> + }, + { + UnknownDataFormat = true + } + ], + OutputName = "Relu_38", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 72, 45, 80]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg8 = %arg5: tensor<1x72x45x80xbf16>, %arg9 = %arg6: tensor<72x1x3x3xbf16>, %arg10 = %arg7: tensor<72xbf16>) attributes { + Dilations = array, + HWPadding = [[1, 1], [1, 1]], + LayerName = "Conv_37", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 72, 45, 80]> : vector<4xindex> + }, + { + CurrentDataFormat = "CMHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.wts", + SubPort = "wts_data", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[72, 1, 3, 3]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat 
= true + } + ], + OutputName = "Relu_38", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 72, 45, 80]> : vector<4xindex> + } + ], + Specializes = "DepthwiseConv2dBf16", + Traits = { + NonNegativeOut = true + }, + With = { + config.act = 1 : ui8, + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.kernel_height = 3 : ui8, + config.kernel_width = 3 : ui8, + config.stride = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %463 = "tosa.const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %464 = "tosa.const"() <{value = dense<[2, 3, 0, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc317) + %465 = tosa.transpose %arg9, %464 : (tensor<72x1x3x3xbf16>, tensor<4xi32>) -> tensor<3x3x72x1xbf16> loc(#loc317) + %466 = tosa.transpose %arg8, %463 : (tensor<1x72x45x80xbf16>, tensor<4xi32>) -> tensor<1x45x80x72xbf16> loc(#loc317) + %467 = tosa.depthwise_conv2d %466, %465, %arg10 { + PartOfLayerName = "Conv_37", + PartOfOutputName = "Conv_37", + dilation = array, + pad = array, + stride = array} : (tensor<1x45x80x72xbf16>, tensor<3x3x72x1xbf16>, tensor<72xbf16>) -> tensor<1x45x80x72xbf16> loc(#loc29) + %468 = tosa.clamp %467 { + LayerName = "Relu_38", + OutputName = "Relu_38", + max_fp = 3.40282347E+38 : f32, + max_int = 2147483647 : i64, + min_fp = 0.000000e+00 : f32, + min_int = 0 : i64} : (tensor<1x45x80x72xbf16>) -> tensor<1x45x80x72xbf16> loc(#loc30) + %469 = tosa.transpose %468, %462 : (tensor<1x45x80x72xbf16>, tensor<4xi32>) -> tensor<1x72x45x80xbf16> loc(#loc317) + xten_nn.output %469 : tensor<1x72x45x80xbf16> loc(#loc30) + } -> tensor<1x72x45x80xbf16> loc(#loc317) + xten_nn.output %461 : tensor<1x72x45x80xbf16> loc(#loc317) + } -> tensor<1x72x45x80xbf16> loc(#loc317) + %181 = xten_nn.subgraph (%arg5 = %180: tensor<1x72x45x80xbf16>, %arg6 = %143: tensor<24x72x1x1xbf16>, %arg7 = %142: tensor<24xbf16>, %arg8 = %178: tensor<1x24x45x80xbf16>) attributes { + IfmOperands = [0 : index, 3 : index], + LayerName = "Conv_39", + OfmShare = 3 : index, + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 72, 45, 80]> : vector<4xindex> + }, + { + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[24, 72, 1, 1]> : vector<4xindex> + }, + { + UnknownDataFormat = true + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 24, 45, 80]> : vector<4xindex> + } + ], + OutputName = "Add_40", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 24, 45, 80]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg9 = %arg5: tensor<1x72x45x80xbf16>, %arg10 = %arg6: tensor<24x72x1x1xbf16>, %arg11 = %arg7: tensor<24xbf16>) attributes { + Dilations = array, + HWPadding = [[0, 0], [0, 0]], + 
LayerName = "Conv_39", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 72, 45, 80]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "wts_data", + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[24, 72, 1, 1]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Conv_39", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 24, 45, 80]> : vector<4xindex> + } + ], + Specializes = "Conv2DBf16", + Traits = { + AllowDMAOptimization = true + }, + With = { + config.AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 = 1 : ui8, + config.act = 0 : ui8, + config.act_type = "RELU", + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.conv_type = [0 : ui8, 12 : ui8, 64 : ui8], + config.dtype_ifm = "bfloat16", + config.dtype_ofm = "bfloat16", + config.dtype_wts = "bfloat16", + config.ksize.height = 1 : ui8, + config.ksize.width = 1 : ui8, + config.lrelu_alpha = 1.000000e+00 : bf16, + config.lrelu_alpha_kernel = 1.000000e+00 : bf16, + config.stride_h = 1 : ui8, + config.stride_w = 1 : ui8 + }} { + %463 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %464 = "tosa.const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc31) + %465 = tosa.reshape %arg10 {new_shape = array} : (tensor<24x72x1x1xbf16>) -> tensor<24x1x1x72xbf16> loc(#loc31) + %466 = tosa.transpose %arg9, %464 : (tensor<1x72x45x80xbf16>, tensor<4xi32>) -> tensor<1x45x80x72xbf16> loc(#loc31) + %467 = tosa.conv2d %466, %465, %arg11 { + PartOfLayerName = "Conv_39", + PartOfOutputName = "Conv_39", + dilation = array, + pad = array, + stride = array} : (tensor<1x45x80x72xbf16>, tensor<24x1x1x72xbf16>, tensor<24xbf16>) -> tensor<1x45x80x24xbf16> loc(#loc31) + %468 = tosa.transpose %467, %463 : (tensor<1x45x80x24xbf16>, tensor<4xi32>) -> tensor<1x24x45x80xbf16> loc(#loc31) + xten_nn.output %468 : tensor<1x24x45x80xbf16> loc(#loc31) + } -> tensor<1x24x45x80xbf16> loc(#loc31) + %462 = xten_nn.subgraph (%arg9 = %461: tensor<1x24x45x80xbf16>, %arg10 = %arg8: tensor<1x24x45x80xbf16>) attributes { + LayerName = "Add_40", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 24, 45, 80]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 24, 45, 80]> : vector<4xindex> + } + ], + OutputName = "Add_40", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 24, 45, 80]> : vector<4xindex> + } + ], + Specializes = "AddBf16", + Traits = { + Binary = true, + Elementwise = true + }, + With = { + config.act = 0 : ui8, + config.act_type = "LINEAR", + config.aie_arch = "aie2p", + config.compiler = "chess", 
+ config.dtype = "bfloat16", + config.num_kernel_iters = 0 : ui16 + }} { + %463 = tosa.add %arg9, %arg10 {LayerName = "Add_40", OutputName = "Add_40"} : (tensor<1x24x45x80xbf16>, tensor<1x24x45x80xbf16>) -> tensor<1x24x45x80xbf16> loc(#loc32) + xten_nn.output %463 : tensor<1x24x45x80xbf16> loc(#loc32) + } -> tensor<1x24x45x80xbf16> loc(#loc32) + xten_nn.output %462 : tensor<1x24x45x80xbf16> loc(#loc32) + } -> tensor<1x24x45x80xbf16> loc(#loc318) + %182 = xten_nn.subgraph (%arg5 = %181: tensor<1x24x45x80xbf16>, %arg6 = %141: tensor<72x24x1x1xbf16>, %arg7 = %140: tensor<72xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Conv_41", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 24, 45, 80]> : vector<4xindex> + }, + { + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[72, 24, 1, 1]> : vector<4xindex> + }, + { + UnknownDataFormat = true + } + ], + OutputName = "Relu_42", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 72, 45, 80]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg8 = %arg5: tensor<1x24x45x80xbf16>, %arg9 = %arg6: tensor<72x24x1x1xbf16>, %arg10 = %arg7: tensor<72xbf16>) attributes { + Dilations = array, + HWPadding = [[0, 0], [0, 0]], + LayerName = "Conv_41", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 24, 45, 80]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "wts_data", + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[72, 24, 1, 1]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Relu_42", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 72, 45, 80]> : vector<4xindex> + } + ], + Specializes = "Conv2DBf16", + Traits = { + AllowDMAOptimization = true, + NonNegativeOut = true + }, + With = { + config.AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 = 1 : ui8, + config.act = 1 : ui8, + config.act_type = "RELU", + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.conv_type = [0 : ui8, 12 : ui8, 64 : ui8], + config.dtype_ifm = "bfloat16", + config.dtype_ofm = "bfloat16", + config.dtype_wts = "bfloat16", + config.ksize.height = 1 : ui8, + config.ksize.width = 1 : ui8, + config.lrelu_alpha = 0.000000e+00 : bf16, + config.lrelu_alpha_kernel = 0.000000e+00 : bf16, + config.stride_h = 1 : ui8, + config.stride_w = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %463 = "tosa.const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc319) + %464 = tosa.reshape %arg9 {new_shape = array} : (tensor<72x24x1x1xbf16>) -> tensor<72x1x1x24xbf16> loc(#loc319) + %465 = 
tosa.transpose %arg8, %463 : (tensor<1x24x45x80xbf16>, tensor<4xi32>) -> tensor<1x45x80x24xbf16> loc(#loc319) + %466 = tosa.conv2d %465, %464, %arg10 { + PartOfLayerName = "Conv_41", + PartOfOutputName = "Conv_41", + dilation = array, + pad = array, + stride = array} : (tensor<1x45x80x24xbf16>, tensor<72x1x1x24xbf16>, tensor<72xbf16>) -> tensor<1x45x80x72xbf16> loc(#loc33) + %467 = tosa.clamp %466 { + LayerName = "Relu_42", + OutputName = "Relu_42", + max_fp = 3.40282347E+38 : f32, + max_int = 2147483647 : i64, + min_fp = 0.000000e+00 : f32, + min_int = 0 : i64} : (tensor<1x45x80x72xbf16>) -> tensor<1x45x80x72xbf16> loc(#loc34) + %468 = tosa.transpose %467, %462 : (tensor<1x45x80x72xbf16>, tensor<4xi32>) -> tensor<1x72x45x80xbf16> loc(#loc319) + xten_nn.output %468 : tensor<1x72x45x80xbf16> loc(#loc34) + } -> tensor<1x72x45x80xbf16> loc(#loc319) + xten_nn.output %461 : tensor<1x72x45x80xbf16> loc(#loc319) + } -> tensor<1x72x45x80xbf16> loc(#loc319) + %183 = xten_nn.subgraph (%arg5 = %182: tensor<1x72x45x80xbf16>, %arg6 = %139: tensor<72x1x5x5xbf16>, %arg7 = %138: tensor<72xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Conv_43", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 72, 45, 80]> : vector<4xindex> + }, + { + CurrentDataFormat = "CMHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[72, 1, 5, 5]> : vector<4xindex> + }, + { + UnknownDataFormat = true + } + ], + OutputName = "Relu_44", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 72, 23, 40]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg8 = %arg5: tensor<1x72x45x80xbf16>, %arg9 = %arg6: tensor<72x1x5x5xbf16>, %arg10 = %arg7: tensor<72xbf16>) attributes { + Dilations = array, + HWPadding = [[2, 2], [2, 1]], + LayerName = "Conv_43", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 72, 45, 80]> : vector<4xindex> + }, + { + CurrentDataFormat = "CMHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.wts", + SubPort = "wts_data", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[72, 1, 5, 5]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Relu_44", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 72, 23, 40]> : vector<4xindex> + } + ], + Specializes = "DepthwiseConv2dBf16", + Traits = { + NonNegativeOut = true + }, + With = { + config.act = 1 : ui8, + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.kernel_height = 5 : ui8, + config.kernel_width = 5 : ui8, + config.stride = 2 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + 
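+ // Depthwise variant (Conv_43): the 72x1x5x5 CMHW weights are transposed with perm [2, 3, 0, 1] to the HWCM layout (5x5x72x1) that tosa.depthwise_conv2d expects. With a 5x5 kernel, stride 2, and HWPadding [[2, 2], [2, 1]], the 45x80 ifm maps to the 23x40 ofm: (45 + 2 + 2 - 5)/2 + 1 = 23 and (80 + 2 + 1 - 5)/2 + 1 = 40.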
%463 = "tosa.const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %464 = "tosa.const"() <{value = dense<[2, 3, 0, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc320) + %465 = tosa.transpose %arg9, %464 : (tensor<72x1x5x5xbf16>, tensor<4xi32>) -> tensor<5x5x72x1xbf16> loc(#loc320) + %466 = tosa.transpose %arg8, %463 : (tensor<1x72x45x80xbf16>, tensor<4xi32>) -> tensor<1x45x80x72xbf16> loc(#loc320) + %467 = tosa.depthwise_conv2d %466, %465, %arg10 { + PartOfLayerName = "Conv_43", + PartOfOutputName = "Conv_43", + dilation = array<i64: 1, 1>, + pad = array<i64: 2, 2, 2, 1>, + stride = array<i64: 2, 2>} : (tensor<1x45x80x72xbf16>, tensor<5x5x72x1xbf16>, tensor<72xbf16>) -> tensor<1x23x40x72xbf16> loc(#loc35) + %468 = tosa.clamp %467 { + LayerName = "Relu_44", + OutputName = "Relu_44", + max_fp = 3.40282347E+38 : f32, + max_int = 2147483647 : i64, + min_fp = 0.000000e+00 : f32, + min_int = 0 : i64} : (tensor<1x23x40x72xbf16>) -> tensor<1x23x40x72xbf16> loc(#loc36) + %469 = tosa.transpose %468, %462 : (tensor<1x23x40x72xbf16>, tensor<4xi32>) -> tensor<1x72x23x40xbf16> loc(#loc320) + xten_nn.output %469 : tensor<1x72x23x40xbf16> loc(#loc36) + } -> tensor<1x72x23x40xbf16> loc(#loc320) + xten_nn.output %461 : tensor<1x72x23x40xbf16> loc(#loc320) + } -> tensor<1x72x23x40xbf16> loc(#loc320) + %184 = xten_nn.subgraph (%arg5 = %183: tensor<1x72x23x40xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Generated-#6", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 72, 23, 40]> : vector<4xindex> + } + ], + OutputName = "Generated-#7", + Overlay = "1x1_1x1_unspecifiedConnectivity", + Reason = "TemplatedGraph", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 72, 1, 920]> : vector<4xindex> + } + ], + Specializes = "Transpose4dAdf", + With = { + config.aie_arch = "aie2p", + config.dim_0 = 23 : ui32, + config.dim_1 = 9 : ui32, + config.dim_2 = 40 : ui32, + config.dim_3 = 8 : ui32, + config.dtype = "bfloat16", + config.perm = 6 : ui32 + }} { + %461 = tosa.reshape %arg5 {new_shape = array<i64: 1, 72, 1, 920>} : (tensor<1x72x23x40xbf16>) -> tensor<1x72x1x920xbf16> loc(#loc37) + xten_nn.output %461 : tensor<1x72x1x920xbf16> loc(#loc37) + } -> tensor<1x72x1x920xbf16> loc(#loc37) + %185 = xten_nn.subgraph (%arg5 = %184: tensor<1x72x1x920xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Generated-#8", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 72, 1, 920]> : vector<4xindex> + } + ], + OutputName = "Generated-#9", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 72, 1, 1]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x72x1x920xbf16>) attributes { + LayerName = "Generated-#8", + Operands = [ + {
CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 72, 1, 920]> : vector<4xindex> + } + ], + OutputName = "Generated-#9", + PadValue = 0.000000e+00 : bf16, + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 72, 1, 1]> : vector<4xindex> + } + ], + Specializes = "ReduceMeanC8Bf16", + Traits = { + Reduce = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.full_channel = 72 : ui32, + config.full_height = 1 : ui32, + config.full_width = 920 : ui32, + config.reduce_dim = "W" + }} { + %462 = xten_nn.reduce_mean %arg6 {axes = array, keepdims = 1 : i64} : (tensor<1x72x1x920xbf16>) -> tensor<1x72x1x1xbf16> loc(#loc37) + xten_nn.output %462 : tensor<1x72x1x1xbf16> loc(#loc37) + } -> tensor<1x72x1x1xbf16> loc(#loc37) + xten_nn.output %461 : tensor<1x72x1x1xbf16> loc(#loc37) + } -> tensor<1x72x1x1xbf16> loc(#loc37) + %186 = xten_nn.subgraph (%arg5 = %185: tensor<1x72x1x1xbf16>, %arg6 = %137: tensor<24x72x1x1xbf16>, %arg7 = %136: tensor<24xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Conv_46", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 72, 1, 1]> : vector<4xindex> + }, + { + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[24, 72, 1, 1]> : vector<4xindex> + }, + { + UnknownDataFormat = true + } + ], + OutputName = "Relu_47", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 24, 1, 1]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg8 = %arg5: tensor<1x72x1x1xbf16>, %arg9 = %arg6: tensor<24x72x1x1xbf16>, %arg10 = %arg7: tensor<24xbf16>) attributes { + Dilations = array, + HWPadding = [[0, 0], [0, 0]], + LayerName = "Conv_46", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 72, 1, 1]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "wts_data", + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[24, 72, 1, 1]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Relu_47", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 24, 1, 1]> : vector<4xindex> + } + ], + Specializes = "Conv2DBf16", + Traits = { + AllowDMAOptimization = true, + NonNegativeOut = true + }, + With = { + config.AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 = 1 : ui8, + config.act = 1 : ui8, + config.act_type = "RELU", + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.conv_type = [0 
: ui8, 12 : ui8, 64 : ui8], + config.dtype_ifm = "bfloat16", + config.dtype_ofm = "bfloat16", + config.dtype_wts = "bfloat16", + config.ksize.height = 1 : ui8, + config.ksize.width = 1 : ui8, + config.lrelu_alpha = 0.000000e+00 : bf16, + config.lrelu_alpha_kernel = 0.000000e+00 : bf16, + config.stride_h = 1 : ui8, + config.stride_w = 1 : ui8 + }} { + %462 = tosa.reshape %arg9 {new_shape = array} : (tensor<24x72x1x1xbf16>) -> tensor<24x1x1x72xbf16> loc(#loc321) + %463 = tosa.reshape %arg8 {new_shape = array} : (tensor<1x72x1x1xbf16>) -> tensor<1x1x1x72xbf16> loc(#loc321) + %464 = tosa.conv2d %463, %462, %arg10 { + PartOfLayerName = "Conv_46", + PartOfOutputName = "Conv_46", + dilation = array, + pad = array, + stride = array} : (tensor<1x1x1x72xbf16>, tensor<24x1x1x72xbf16>, tensor<24xbf16>) -> tensor<1x1x1x24xbf16> loc(#loc38) + %465 = tosa.clamp %464 { + LayerName = "Relu_47", + OutputName = "Relu_47", + max_fp = 3.40282347E+38 : f32, + max_int = 2147483647 : i64, + min_fp = 0.000000e+00 : f32, + min_int = 0 : i64} : (tensor<1x1x1x24xbf16>) -> tensor<1x1x1x24xbf16> loc(#loc39) + %466 = tosa.reshape %465 {new_shape = array} : (tensor<1x1x1x24xbf16>) -> tensor<1x24x1x1xbf16> loc(#loc321) + xten_nn.output %466 : tensor<1x24x1x1xbf16> loc(#loc39) + } -> tensor<1x24x1x1xbf16> loc(#loc321) + xten_nn.output %461 : tensor<1x24x1x1xbf16> loc(#loc321) + } -> tensor<1x24x1x1xbf16> loc(#loc321) + %187 = xten_nn.subgraph (%arg5 = %186: tensor<1x24x1x1xbf16>, %arg6 = %135: tensor<72x24x1x1xbf16>, %arg7 = %134: tensor<72xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Conv_48", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 24, 1, 1]> : vector<4xindex> + }, + { + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[72, 24, 1, 1]> : vector<4xindex> + }, + { + UnknownDataFormat = true + } + ], + OutputName = "Conv_48", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 72, 1, 1]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg8 = %arg5: tensor<1x24x1x1xbf16>, %arg9 = %arg6: tensor<72x24x1x1xbf16>, %arg10 = %arg7: tensor<72xbf16>) attributes { + Dilations = array, + HWPadding = [[0, 0], [0, 0]], + LayerName = "Conv_48", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 24, 1, 1]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "wts_data", + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[72, 24, 1, 1]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Conv_48", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 72, 1, 1]> : vector<4xindex> + } + ], + Specializes = "Conv2DBf16", + Traits = { + AllowDMAOptimization = true + }, + With 
= { + config.AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 = 1 : ui8, + config.act = 0 : ui8, + config.act_type = "RELU", + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.conv_type = [0 : ui8, 12 : ui8, 64 : ui8], + config.dtype_ifm = "bfloat16", + config.dtype_ofm = "bfloat16", + config.dtype_wts = "bfloat16", + config.ksize.height = 1 : ui8, + config.ksize.width = 1 : ui8, + config.lrelu_alpha = 1.000000e+00 : bf16, + config.lrelu_alpha_kernel = 1.000000e+00 : bf16, + config.stride_h = 1 : ui8, + config.stride_w = 1 : ui8 + }} { + %462 = tosa.reshape %arg9 {new_shape = array} : (tensor<72x24x1x1xbf16>) -> tensor<72x1x1x24xbf16> loc(#loc40) + %463 = tosa.reshape %arg8 {new_shape = array} : (tensor<1x24x1x1xbf16>) -> tensor<1x1x1x24xbf16> loc(#loc40) + %464 = tosa.conv2d %463, %462, %arg10 { + PartOfLayerName = "Conv_48", + PartOfOutputName = "Conv_48", + dilation = array, + pad = array, + stride = array} : (tensor<1x1x1x24xbf16>, tensor<72x1x1x24xbf16>, tensor<72xbf16>) -> tensor<1x1x1x72xbf16> loc(#loc40) + %465 = tosa.reshape %464 {new_shape = array} : (tensor<1x1x1x72xbf16>) -> tensor<1x72x1x1xbf16> loc(#loc40) + xten_nn.output %465 : tensor<1x72x1x1xbf16> loc(#loc40) + } -> tensor<1x72x1x1xbf16> loc(#loc40) + xten_nn.output %461 : tensor<1x72x1x1xbf16> loc(#loc40) + } -> tensor<1x72x1x1xbf16> loc(#loc40) + %188 = xten_nn.subgraph (%arg5 = %187: tensor<1x72x1x1xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Add_50", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 72, 1, 1]> : vector<4xindex> + } + ], + OutputName = "Add_50", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 72, 1, 1]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x72x1x1xbf16>) attributes { + LayerName = "Add_50", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 72, 1, 1]> : vector<4xindex> + } + ], + OutputName = "Add_50", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 72, 1, 1]> : vector<4xindex> + } + ], + Specializes = "AddAttributeBroadcastingBf16", + Traits = { + Elementwise = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.num_kernel_iters = 0 : ui16, + config.scalar = 3.000000e+00 : bf16, + config.scalar_position = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<3.000000e+00> : tensor<1x1x1x1xbf16>}> : () -> tensor<1x1x1x1xbf16> loc(#loc) + %463 = tosa.add %arg6, %462 {LayerName = "Add_50", OutputName = "Add_50"} : (tensor<1x72x1x1xbf16>, tensor<1x1x1x1xbf16>) -> tensor<1x72x1x1xbf16> loc(#loc41) + xten_nn.output %463 : tensor<1x72x1x1xbf16> loc(#loc41) + } -> tensor<1x72x1x1xbf16> loc(#loc41) + xten_nn.output %461 : tensor<1x72x1x1xbf16> loc(#loc41) + } -> tensor<1x72x1x1xbf16> 
loc(#loc41) + %189 = xten_nn.subgraph (%arg5 = %188: tensor<1x72x1x1xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Clip_53", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 72, 1, 1]> : vector<4xindex> + } + ], + OutputName = "Clip_53", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 72, 1, 1]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x72x1x1xbf16>) attributes { + LayerName = "Clip_53", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 72, 1, 1]> : vector<4xindex> + } + ], + OutputName = "Clip_53", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 72, 1, 1]> : vector<4xindex> + } + ], + Specializes = "ClipBf16", + Traits = { + Elementwise = true, + NonNegativeOut = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.clamp_max = 6.000000e+00 : bf16, + config.clamp_min = 0.000000e+00 : bf16, + config.compiler = "chess", + config.ifm_shift = 0 : si8, + config.num_kernel_iters = 0 : ui16, + config.ofm_shift = 0 : si8 + }} { + %462 = tosa.clamp %arg6 { + LayerName = "Clip_53", + OutputName = "Clip_53", + max_fp = 6.000000e+00 : f32, + max_int = 6 : i64, + min_fp = 0.000000e+00 : f32, + min_int = 0 : i64} : (tensor<1x72x1x1xbf16>) -> tensor<1x72x1x1xbf16> loc(#loc42) + xten_nn.output %462 : tensor<1x72x1x1xbf16> loc(#loc42) + } -> tensor<1x72x1x1xbf16> loc(#loc42) + xten_nn.output %461 : tensor<1x72x1x1xbf16> loc(#loc42) + } -> tensor<1x72x1x1xbf16> loc(#loc42) + %190 = xten_nn.subgraph (%arg5 = %189: tensor<1x72x1x1xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Div_55", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 72, 1, 1]> : vector<4xindex> + } + ], + OutputName = "Div_55", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 72, 1, 1]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x72x1x1xbf16>) attributes { + LayerName = "Div_55", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 72, 1, 1]> : vector<4xindex> + } + ], + OutputName = "Div_55", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + 
l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 72, 1, 1]> : vector<4xindex> + } + ], + Specializes = "MulAttributeBroadcastingBf16", + Traits = { + Elementwise = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.num_kernel_iters = 0 : ui16, + config.scalar = 1.660160e-01 : bf16, + config.scalar_position = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<1.660160e-01> : tensor<1x1x1x1xbf16>}> : () -> tensor<1x1x1x1xbf16> loc(#loc) + %463 = tosa.mul %arg6, %462 { + LayerName = "Div_55", + OutputName = "Div_55", + shift = 0 : i8} : (tensor<1x72x1x1xbf16>, tensor<1x1x1x1xbf16>) -> tensor<1x72x1x1xbf16> loc(#loc43) + xten_nn.output %463 : tensor<1x72x1x1xbf16> loc(#loc43) + } -> tensor<1x72x1x1xbf16> loc(#loc43) + xten_nn.output %461 : tensor<1x72x1x1xbf16> loc(#loc43) + } -> tensor<1x72x1x1xbf16> loc(#loc43) + %191 = xten_nn.subgraph (%arg5 = %190: tensor<1x72x1x1xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Generated-#10", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 72, 1, 1]> : vector<4xindex> + } + ], + OutputName = "Generated-#11", + Overlay = "1x1_1x1_unspecifiedConnectivity", + Reason = "TemplatedGraph", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 72, 23, 40]> : vector<4xindex> + } + ], + Specializes = "TileAdf", + With = { + config.aie_arch = "aie2p", + config.dtype = "bfloat16", + config.i_dim_c = 72 : ui32, + config.i_dim_h = 1 : ui32, + config.i_dim_n = 1 : ui32, + config.i_dim_w = 1 : ui32, + config.rep_dim_c = 1 : ui32, + config.rep_dim_h = 23 : ui32, + config.rep_dim_w = 40 : ui32 + }} { + %461 = tosa.tile %arg5 {multiples = array} : (tensor<1x72x1x1xbf16>) -> tensor<1x72x23x40xbf16> loc(#loc44) + xten_nn.output %461 : tensor<1x72x23x40xbf16> loc(#loc44) + } -> tensor<1x72x23x40xbf16> loc(#loc44) + %192 = xten_nn.subgraph (%arg5 = %191: tensor<1x72x23x40xbf16>, %arg6 = %183: tensor<1x72x23x40xbf16>) attributes { + IfmOperands = [0 : index, 1 : index], + LayerName = "Mul_56", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 72, 23, 40]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 72, 23, 40]> : vector<4xindex> + } + ], + OutputName = "Mul_56", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 72, 23, 40]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg7 = %arg5: tensor<1x72x23x40xbf16>, %arg8 = %arg6: tensor<1x72x23x40xbf16>) attributes { + LayerName = "Mul_56", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + 
L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 72, 23, 40]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 72, 23, 40]> : vector<4xindex> + } + ], + OutputName = "Mul_56", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 72, 23, 40]> : vector<4xindex> + } + ], + Specializes = "MulBf16", + Traits = { + Binary = true, + Elementwise = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.dtype = "bfloat16", + config.num_kernel_iters = 0 : ui16 + }} { + %462 = tosa.mul %arg7, %arg8 { + LayerName = "Mul_56", + OutputName = "Mul_56", + shift = 0 : i8} : (tensor<1x72x23x40xbf16>, tensor<1x72x23x40xbf16>) -> tensor<1x72x23x40xbf16> loc(#loc44) + xten_nn.output %462 : tensor<1x72x23x40xbf16> loc(#loc44) + } -> tensor<1x72x23x40xbf16> loc(#loc44) + xten_nn.output %461 : tensor<1x72x23x40xbf16> loc(#loc44) + } -> tensor<1x72x23x40xbf16> loc(#loc44) + %193 = xten_nn.subgraph (%arg5 = %192: tensor<1x72x23x40xbf16>, %arg6 = %133: tensor<40x72x1x1xbf16>, %arg7 = %132: tensor<40xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Conv_57", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 72, 23, 40]> : vector<4xindex> + }, + { + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[40, 72, 1, 1]> : vector<4xindex> + }, + { + UnknownDataFormat = true + } + ], + OutputName = "Conv_57", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 23, 40]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg8 = %arg5: tensor<1x72x23x40xbf16>, %arg9 = %arg6: tensor<40x72x1x1xbf16>, %arg10 = %arg7: tensor<40xbf16>) attributes { + Dilations = array, + HWPadding = [[0, 0], [0, 0]], + LayerName = "Conv_57", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 72, 23, 40]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "wts_data", + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[40, 72, 1, 1]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Conv_57", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 23, 40]> : vector<4xindex> + } + ], + Specializes = "Conv2DBf16", + Traits = { + AllowDMAOptimization = true + }, + With = { + config.AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 = 1 : ui8, + config.act 
= 0 : ui8, + config.act_type = "RELU", + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.conv_type = [0 : ui8, 12 : ui8, 64 : ui8], + config.dtype_ifm = "bfloat16", + config.dtype_ofm = "bfloat16", + config.dtype_wts = "bfloat16", + config.ksize.height = 1 : ui8, + config.ksize.width = 1 : ui8, + config.lrelu_alpha = 1.000000e+00 : bf16, + config.lrelu_alpha_kernel = 1.000000e+00 : bf16, + config.stride_h = 1 : ui8, + config.stride_w = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %463 = "tosa.const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc45) + %464 = tosa.reshape %arg9 {new_shape = array} : (tensor<40x72x1x1xbf16>) -> tensor<40x1x1x72xbf16> loc(#loc45) + %465 = tosa.transpose %arg8, %463 : (tensor<1x72x23x40xbf16>, tensor<4xi32>) -> tensor<1x23x40x72xbf16> loc(#loc45) + %466 = tosa.conv2d %465, %464, %arg10 { + PartOfLayerName = "Conv_57", + PartOfOutputName = "Conv_57", + dilation = array, + pad = array, + stride = array} : (tensor<1x23x40x72xbf16>, tensor<40x1x1x72xbf16>, tensor<40xbf16>) -> tensor<1x23x40x40xbf16> loc(#loc45) + %467 = tosa.transpose %466, %462 : (tensor<1x23x40x40xbf16>, tensor<4xi32>) -> tensor<1x40x23x40xbf16> loc(#loc45) + xten_nn.output %467 : tensor<1x40x23x40xbf16> loc(#loc45) + } -> tensor<1x40x23x40xbf16> loc(#loc45) + xten_nn.output %461 : tensor<1x40x23x40xbf16> loc(#loc45) + } -> tensor<1x40x23x40xbf16> loc(#loc45) + %194 = xten_nn.subgraph (%arg5 = %193: tensor<1x40x23x40xbf16>, %arg6 = %131: tensor<120x40x1x1xbf16>, %arg7 = %130: tensor<120xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Conv_58", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 23, 40]> : vector<4xindex> + }, + { + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[120, 40, 1, 1]> : vector<4xindex> + }, + { + UnknownDataFormat = true + } + ], + OutputName = "Relu_59", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 23, 40]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg8 = %arg5: tensor<1x40x23x40xbf16>, %arg9 = %arg6: tensor<120x40x1x1xbf16>, %arg10 = %arg7: tensor<120xbf16>) attributes { + Dilations = array, + HWPadding = [[0, 0], [0, 0]], + LayerName = "Conv_58", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 23, 40]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "wts_data", + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[120, 40, 1, 1]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Relu_59", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + 
l3_tile_count = dense<[1, 120, 23, 40]> : vector<4xindex> + } + ], + Specializes = "Conv2DBf16", + Traits = { + AllowDMAOptimization = true, + NonNegativeOut = true + }, + With = { + config.AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 = 1 : ui8, + config.act = 1 : ui8, + config.act_type = "RELU", + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.conv_type = [0 : ui8, 12 : ui8, 64 : ui8], + config.dtype_ifm = "bfloat16", + config.dtype_ofm = "bfloat16", + config.dtype_wts = "bfloat16", + config.ksize.height = 1 : ui8, + config.ksize.width = 1 : ui8, + config.lrelu_alpha = 0.000000e+00 : bf16, + config.lrelu_alpha_kernel = 0.000000e+00 : bf16, + config.stride_h = 1 : ui8, + config.stride_w = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %463 = "tosa.const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc322) + %464 = tosa.reshape %arg9 {new_shape = array} : (tensor<120x40x1x1xbf16>) -> tensor<120x1x1x40xbf16> loc(#loc322) + %465 = tosa.transpose %arg8, %463 : (tensor<1x40x23x40xbf16>, tensor<4xi32>) -> tensor<1x23x40x40xbf16> loc(#loc322) + %466 = tosa.conv2d %465, %464, %arg10 { + PartOfLayerName = "Conv_58", + PartOfOutputName = "Conv_58", + dilation = array, + pad = array, + stride = array} : (tensor<1x23x40x40xbf16>, tensor<120x1x1x40xbf16>, tensor<120xbf16>) -> tensor<1x23x40x120xbf16> loc(#loc46) + %467 = tosa.clamp %466 { + LayerName = "Relu_59", + OutputName = "Relu_59", + max_fp = 3.40282347E+38 : f32, + max_int = 2147483647 : i64, + min_fp = 0.000000e+00 : f32, + min_int = 0 : i64} : (tensor<1x23x40x120xbf16>) -> tensor<1x23x40x120xbf16> loc(#loc47) + %468 = tosa.transpose %467, %462 : (tensor<1x23x40x120xbf16>, tensor<4xi32>) -> tensor<1x120x23x40xbf16> loc(#loc322) + xten_nn.output %468 : tensor<1x120x23x40xbf16> loc(#loc47) + } -> tensor<1x120x23x40xbf16> loc(#loc322) + xten_nn.output %461 : tensor<1x120x23x40xbf16> loc(#loc322) + } -> tensor<1x120x23x40xbf16> loc(#loc322) + %195 = xten_nn.subgraph (%arg5 = %194: tensor<1x120x23x40xbf16>, %arg6 = %129: tensor<120x1x5x5xbf16>, %arg7 = %128: tensor<120xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Conv_60", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 23, 40]> : vector<4xindex> + }, + { + CurrentDataFormat = "CMHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[120, 1, 5, 5]> : vector<4xindex> + }, + { + UnknownDataFormat = true + } + ], + OutputName = "Relu_61", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 23, 40]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg8 = %arg5: tensor<1x120x23x40xbf16>, %arg9 = %arg6: tensor<120x1x5x5xbf16>, %arg10 = %arg7: tensor<120xbf16>) attributes { + Dilations = array, + HWPadding = [[2, 2], [2, 2]], + LayerName = "Conv_60", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = 
dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 23, 40]> : vector<4xindex> + }, + { + CurrentDataFormat = "CMHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.wts", + SubPort = "wts_data", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[120, 1, 5, 5]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Relu_61", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 23, 40]> : vector<4xindex> + } + ], + Specializes = "DepthwiseConv2dBf16", + Traits = { + NonNegativeOut = true + }, + With = { + config.act = 1 : ui8, + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.kernel_height = 5 : ui8, + config.kernel_width = 5 : ui8, + config.stride = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %463 = "tosa.const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %464 = "tosa.const"() <{value = dense<[2, 3, 0, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc323) + %465 = tosa.transpose %arg9, %464 : (tensor<120x1x5x5xbf16>, tensor<4xi32>) -> tensor<5x5x120x1xbf16> loc(#loc323) + %466 = tosa.transpose %arg8, %463 : (tensor<1x120x23x40xbf16>, tensor<4xi32>) -> tensor<1x23x40x120xbf16> loc(#loc323) + %467 = tosa.depthwise_conv2d %466, %465, %arg10 { + PartOfLayerName = "Conv_60", + PartOfOutputName = "Conv_60", + dilation = array, + pad = array, + stride = array} : (tensor<1x23x40x120xbf16>, tensor<5x5x120x1xbf16>, tensor<120xbf16>) -> tensor<1x23x40x120xbf16> loc(#loc48) + %468 = tosa.clamp %467 { + LayerName = "Relu_61", + OutputName = "Relu_61", + max_fp = 3.40282347E+38 : f32, + max_int = 2147483647 : i64, + min_fp = 0.000000e+00 : f32, + min_int = 0 : i64} : (tensor<1x23x40x120xbf16>) -> tensor<1x23x40x120xbf16> loc(#loc49) + %469 = tosa.transpose %468, %462 : (tensor<1x23x40x120xbf16>, tensor<4xi32>) -> tensor<1x120x23x40xbf16> loc(#loc323) + xten_nn.output %469 : tensor<1x120x23x40xbf16> loc(#loc49) + } -> tensor<1x120x23x40xbf16> loc(#loc323) + xten_nn.output %461 : tensor<1x120x23x40xbf16> loc(#loc323) + } -> tensor<1x120x23x40xbf16> loc(#loc323) + %196 = xten_nn.subgraph (%arg5 = %195: tensor<1x120x23x40xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Generated-#12", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 23, 40]> : vector<4xindex> + } + ], + OutputName = "Generated-#13", + Overlay = "1x1_1x1_unspecifiedConnectivity", + Reason = "TemplatedGraph", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 1, 920]> : vector<4xindex> + } + ], + Specializes = "Transpose4dAdf", + With = { + config.aie_arch = "aie2p", + config.dim_0 = 23 : ui32, + config.dim_1 = 15 : ui32, + config.dim_2 = 40 : ui32, + config.dim_3 = 8 : ui32, + config.dtype = "bfloat16", + config.perm = 6 : ui32 + }} { + %461 = tosa.reshape 
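+ // Annotation: Conv_60/Relu_61 above is a 5x5 depthwise stage (DepthwiseConv2dBf16). The
+ // CMHW 120x1x5x5 weights are permuted with [2, 3, 0, 1] into the HWCM layout (5x5x120x1)
+ // that tosa.depthwise_conv2d expects.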
%arg5 {new_shape = array} : (tensor<1x120x23x40xbf16>) -> tensor<1x120x1x920xbf16> loc(#loc50) + xten_nn.output %461 : tensor<1x120x1x920xbf16> loc(#loc50) + } -> tensor<1x120x1x920xbf16> loc(#loc50) + %197 = xten_nn.subgraph (%arg5 = %196: tensor<1x120x1x920xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Generated-#14", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 1, 920]> : vector<4xindex> + } + ], + OutputName = "Generated-#15", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 1, 1]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x120x1x920xbf16>) attributes { + LayerName = "Generated-#14", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 1, 920]> : vector<4xindex> + } + ], + OutputName = "Generated-#15", + PadValue = 0.000000e+00 : bf16, + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 1, 1]> : vector<4xindex> + } + ], + Specializes = "ReduceMeanC8Bf16", + Traits = { + Reduce = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.full_channel = 120 : ui32, + config.full_height = 1 : ui32, + config.full_width = 920 : ui32, + config.reduce_dim = "W" + }} { + %462 = xten_nn.reduce_mean %arg6 {axes = array, keepdims = 1 : i64} : (tensor<1x120x1x920xbf16>) -> tensor<1x120x1x1xbf16> loc(#loc50) + xten_nn.output %462 : tensor<1x120x1x1xbf16> loc(#loc50) + } -> tensor<1x120x1x1xbf16> loc(#loc50) + xten_nn.output %461 : tensor<1x120x1x1xbf16> loc(#loc50) + } -> tensor<1x120x1x1xbf16> loc(#loc50) + %198 = xten_nn.subgraph (%arg5 = %197: tensor<1x120x1x1xbf16>, %arg6 = %127: tensor<32x120x1x1xbf16>, %arg7 = %126: tensor<32xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Conv_63", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 1, 1]> : vector<4xindex> + }, + { + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[32, 120, 1, 1]> : vector<4xindex> + }, + { + UnknownDataFormat = true + } + ], + OutputName = "Relu_64", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 32, 1, 1]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg8 = %arg5: tensor<1x120x1x1xbf16>, %arg9 = %arg6: tensor<32x120x1x1xbf16>, %arg10 = %arg7: tensor<32xbf16>) attributes { + Dilations = array, + HWPadding = [[0, 0], [0, 
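+ // Annotation: the "Generated-#12"/"Generated-#14" pair appears to be a compiler-synthesized
+ // global average pool: the 23x40 spatial extent is first flattened to W = 920 so that
+ // ReduceMeanC8Bf16 can reduce a single W axis down to 1x120x1x1.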
0]], + LayerName = "Conv_63", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 1, 1]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "wts_data", + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[32, 120, 1, 1]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Relu_64", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 32, 1, 1]> : vector<4xindex> + } + ], + Specializes = "Conv2DBf16", + Traits = { + AllowDMAOptimization = true, + NonNegativeOut = true + }, + With = { + config.AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 = 1 : ui8, + config.act = 1 : ui8, + config.act_type = "RELU", + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.conv_type = [0 : ui8, 12 : ui8, 64 : ui8], + config.dtype_ifm = "bfloat16", + config.dtype_ofm = "bfloat16", + config.dtype_wts = "bfloat16", + config.ksize.height = 1 : ui8, + config.ksize.width = 1 : ui8, + config.lrelu_alpha = 0.000000e+00 : bf16, + config.lrelu_alpha_kernel = 0.000000e+00 : bf16, + config.stride_h = 1 : ui8, + config.stride_w = 1 : ui8 + }} { + %462 = tosa.reshape %arg9 {new_shape = array} : (tensor<32x120x1x1xbf16>) -> tensor<32x1x1x120xbf16> loc(#loc324) + %463 = tosa.reshape %arg8 {new_shape = array} : (tensor<1x120x1x1xbf16>) -> tensor<1x1x1x120xbf16> loc(#loc324) + %464 = tosa.conv2d %463, %462, %arg10 { + PartOfLayerName = "Conv_63", + PartOfOutputName = "Conv_63", + dilation = array, + pad = array, + stride = array} : (tensor<1x1x1x120xbf16>, tensor<32x1x1x120xbf16>, tensor<32xbf16>) -> tensor<1x1x1x32xbf16> loc(#loc51) + %465 = tosa.clamp %464 { + LayerName = "Relu_64", + OutputName = "Relu_64", + max_fp = 3.40282347E+38 : f32, + max_int = 2147483647 : i64, + min_fp = 0.000000e+00 : f32, + min_int = 0 : i64} : (tensor<1x1x1x32xbf16>) -> tensor<1x1x1x32xbf16> loc(#loc52) + %466 = tosa.reshape %465 {new_shape = array} : (tensor<1x1x1x32xbf16>) -> tensor<1x32x1x1xbf16> loc(#loc324) + xten_nn.output %466 : tensor<1x32x1x1xbf16> loc(#loc52) + } -> tensor<1x32x1x1xbf16> loc(#loc324) + xten_nn.output %461 : tensor<1x32x1x1xbf16> loc(#loc324) + } -> tensor<1x32x1x1xbf16> loc(#loc324) + %199 = xten_nn.subgraph (%arg5 = %198: tensor<1x32x1x1xbf16>, %arg6 = %125: tensor<120x32x1x1xbf16>, %arg7 = %124: tensor<120xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Conv_65", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 32, 1, 1]> : vector<4xindex> + }, + { + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[120, 32, 1, 1]> : vector<4xindex> + }, + { + UnknownDataFormat = true + } + ], + OutputName = "Conv_65", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 1, 1]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + 
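+ // Annotation: Conv_63/Relu_64 above is the squeeze half of a squeeze-and-excite block
+ // (120 -> 32 with ReLU); since H = W = 1 here, the layout fixups degenerate to
+ // tosa.reshape instead of the transpose pairs used on full feature maps.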
L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg8 = %arg5: tensor<1x32x1x1xbf16>, %arg9 = %arg6: tensor<120x32x1x1xbf16>, %arg10 = %arg7: tensor<120xbf16>) attributes { + Dilations = array, + HWPadding = [[0, 0], [0, 0]], + LayerName = "Conv_65", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 32, 1, 1]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "wts_data", + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[120, 32, 1, 1]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Conv_65", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 1, 1]> : vector<4xindex> + } + ], + Specializes = "Conv2DBf16", + Traits = { + AllowDMAOptimization = true + }, + With = { + config.AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 = 1 : ui8, + config.act = 0 : ui8, + config.act_type = "RELU", + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.conv_type = [0 : ui8, 12 : ui8, 64 : ui8], + config.dtype_ifm = "bfloat16", + config.dtype_ofm = "bfloat16", + config.dtype_wts = "bfloat16", + config.ksize.height = 1 : ui8, + config.ksize.width = 1 : ui8, + config.lrelu_alpha = 1.000000e+00 : bf16, + config.lrelu_alpha_kernel = 1.000000e+00 : bf16, + config.stride_h = 1 : ui8, + config.stride_w = 1 : ui8 + }} { + %462 = tosa.reshape %arg9 {new_shape = array} : (tensor<120x32x1x1xbf16>) -> tensor<120x1x1x32xbf16> loc(#loc53) + %463 = tosa.reshape %arg8 {new_shape = array} : (tensor<1x32x1x1xbf16>) -> tensor<1x1x1x32xbf16> loc(#loc53) + %464 = tosa.conv2d %463, %462, %arg10 { + PartOfLayerName = "Conv_65", + PartOfOutputName = "Conv_65", + dilation = array, + pad = array, + stride = array} : (tensor<1x1x1x32xbf16>, tensor<120x1x1x32xbf16>, tensor<120xbf16>) -> tensor<1x1x1x120xbf16> loc(#loc53) + %465 = tosa.reshape %464 {new_shape = array} : (tensor<1x1x1x120xbf16>) -> tensor<1x120x1x1xbf16> loc(#loc53) + xten_nn.output %465 : tensor<1x120x1x1xbf16> loc(#loc53) + } -> tensor<1x120x1x1xbf16> loc(#loc53) + xten_nn.output %461 : tensor<1x120x1x1xbf16> loc(#loc53) + } -> tensor<1x120x1x1xbf16> loc(#loc53) + %200 = xten_nn.subgraph (%arg5 = %199: tensor<1x120x1x1xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Add_67", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 1, 1]> : vector<4xindex> + } + ], + OutputName = "Add_67", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 1, 1]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x120x1x1xbf16>) attributes { + LayerName = "Add_67", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + 
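+ // Annotation: Conv_65 above is the matching excite projection (32 -> 120) with act = 0,
+ // i.e. no activation applied; lrelu_alpha = 1.0 appears to be the identity setting of the
+ // kernel's leaky-ReLU slope rather than a real activation.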
L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 1, 1]> : vector<4xindex> + } + ], + OutputName = "Add_67", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 1, 1]> : vector<4xindex> + } + ], + Specializes = "AddAttributeBroadcastingBf16", + Traits = { + Elementwise = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.num_kernel_iters = 0 : ui16, + config.scalar = 3.000000e+00 : bf16, + config.scalar_position = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<3.000000e+00> : tensor<1x1x1x1xbf16>}> : () -> tensor<1x1x1x1xbf16> loc(#loc) + %463 = tosa.add %arg6, %462 {LayerName = "Add_67", OutputName = "Add_67"} : (tensor<1x120x1x1xbf16>, tensor<1x1x1x1xbf16>) -> tensor<1x120x1x1xbf16> loc(#loc54) + xten_nn.output %463 : tensor<1x120x1x1xbf16> loc(#loc54) + } -> tensor<1x120x1x1xbf16> loc(#loc54) + xten_nn.output %461 : tensor<1x120x1x1xbf16> loc(#loc54) + } -> tensor<1x120x1x1xbf16> loc(#loc54) + %201 = xten_nn.subgraph (%arg5 = %200: tensor<1x120x1x1xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Clip_70", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 1, 1]> : vector<4xindex> + } + ], + OutputName = "Clip_70", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 1, 1]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x120x1x1xbf16>) attributes { + LayerName = "Clip_70", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 1, 1]> : vector<4xindex> + } + ], + OutputName = "Clip_70", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 1, 1]> : vector<4xindex> + } + ], + Specializes = "ClipBf16", + Traits = { + Elementwise = true, + NonNegativeOut = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.clamp_max = 6.000000e+00 : bf16, + config.clamp_min = 0.000000e+00 : bf16, + config.compiler = "chess", + config.ifm_shift = 0 : si8, + config.num_kernel_iters = 0 : ui16, + config.ofm_shift = 0 : si8 + }} { + %462 = tosa.clamp %arg6 { + LayerName = "Clip_70", + OutputName = "Clip_70", + max_fp = 6.000000e+00 : f32, + max_int = 6 : i64, + min_fp = 0.000000e+00 : f32, + min_int = 0 : i64} : (tensor<1x120x1x1xbf16>) -> tensor<1x120x1x1xbf16> loc(#loc55) + xten_nn.output %462 : tensor<1x120x1x1xbf16> loc(#loc55) + } -> tensor<1x120x1x1xbf16> loc(#loc55) + xten_nn.output %461 : tensor<1x120x1x1xbf16> loc(#loc55) + } -> tensor<1x120x1x1xbf16> loc(#loc55) + %202 = xten_nn.subgraph (%arg5 = %201: tensor<1x120x1x1xbf16>) attributes { + 
IfmOperands = [0 : index], + LayerName = "Div_72", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 1, 1]> : vector<4xindex> + } + ], + OutputName = "Div_72", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 1, 1]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x120x1x1xbf16>) attributes { + LayerName = "Div_72", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 1, 1]> : vector<4xindex> + } + ], + OutputName = "Div_72", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 1, 1]> : vector<4xindex> + } + ], + Specializes = "MulAttributeBroadcastingBf16", + Traits = { + Elementwise = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.num_kernel_iters = 0 : ui16, + config.scalar = 1.660160e-01 : bf16, + config.scalar_position = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<1.660160e-01> : tensor<1x1x1x1xbf16>}> : () -> tensor<1x1x1x1xbf16> loc(#loc) + %463 = tosa.mul %arg6, %462 { + LayerName = "Div_72", + OutputName = "Div_72", + shift = 0 : i8} : (tensor<1x120x1x1xbf16>, tensor<1x1x1x1xbf16>) -> tensor<1x120x1x1xbf16> loc(#loc56) + xten_nn.output %463 : tensor<1x120x1x1xbf16> loc(#loc56) + } -> tensor<1x120x1x1xbf16> loc(#loc56) + xten_nn.output %461 : tensor<1x120x1x1xbf16> loc(#loc56) + } -> tensor<1x120x1x1xbf16> loc(#loc56) + %203 = xten_nn.subgraph (%arg5 = %202: tensor<1x120x1x1xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Generated-#16", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 1, 1]> : vector<4xindex> + } + ], + OutputName = "Generated-#17", + Overlay = "1x1_1x1_unspecifiedConnectivity", + Reason = "TemplatedGraph", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 23, 40]> : vector<4xindex> + } + ], + Specializes = "TileAdf", + With = { + config.aie_arch = "aie2p", + config.dtype = "bfloat16", + config.i_dim_c = 120 : ui32, + config.i_dim_h = 1 : ui32, + config.i_dim_n = 1 : ui32, + config.i_dim_w = 1 : ui32, + config.rep_dim_c = 1 : ui32, + config.rep_dim_h = 23 : ui32, + config.rep_dim_w = 40 : ui32 + }} { + %461 = tosa.tile %arg5 {multiples = array} : (tensor<1x120x1x1xbf16>) -> tensor<1x120x23x40xbf16> loc(#loc57) + xten_nn.output %461 : tensor<1x120x23x40xbf16> loc(#loc57) + } -> tensor<1x120x23x40xbf16> loc(#loc57) + %204 = xten_nn.subgraph (%arg5 = %203: 
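+ // Annotation: Div_72 above completes the hard-sigmoid as a multiply by 1.660160e-01, the
+ // bf16 value nearest to 1/6 (the original ONNX division by 6 is folded into a reciprocal
+ // multiply); the TileAdf subgraph then broadcasts the 1x120x1x1 gate back to
+ // 1x120x23x40 because the elementwise MulBf16 kernel takes equal-shaped inputs.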
tensor<1x120x23x40xbf16>, %arg6 = %195: tensor<1x120x23x40xbf16>) attributes { + IfmOperands = [0 : index, 1 : index], + LayerName = "Mul_73", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 23, 40]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 23, 40]> : vector<4xindex> + } + ], + OutputName = "Mul_73", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 23, 40]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg7 = %arg5: tensor<1x120x23x40xbf16>, %arg8 = %arg6: tensor<1x120x23x40xbf16>) attributes { + LayerName = "Mul_73", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 23, 40]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 23, 40]> : vector<4xindex> + } + ], + OutputName = "Mul_73", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 23, 40]> : vector<4xindex> + } + ], + Specializes = "MulBf16", + Traits = { + Binary = true, + Elementwise = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.dtype = "bfloat16", + config.num_kernel_iters = 0 : ui16 + }} { + %462 = tosa.mul %arg7, %arg8 { + LayerName = "Mul_73", + OutputName = "Mul_73", + shift = 0 : i8} : (tensor<1x120x23x40xbf16>, tensor<1x120x23x40xbf16>) -> tensor<1x120x23x40xbf16> loc(#loc57) + xten_nn.output %462 : tensor<1x120x23x40xbf16> loc(#loc57) + } -> tensor<1x120x23x40xbf16> loc(#loc57) + xten_nn.output %461 : tensor<1x120x23x40xbf16> loc(#loc57) + } -> tensor<1x120x23x40xbf16> loc(#loc57) + %205 = xten_nn.subgraph (%arg5 = %204: tensor<1x120x23x40xbf16>, %arg6 = %123: tensor<40x120x1x1xbf16>, %arg7 = %122: tensor<40xbf16>, %arg8 = %193: tensor<1x40x23x40xbf16>) attributes { + IfmOperands = [0 : index, 3 : index], + LayerName = "Conv_74", + OfmShare = 3 : index, + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 23, 40]> : vector<4xindex> + }, + { + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[40, 120, 1, 1]> : vector<4xindex> + }, + { + UnknownDataFormat = true + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 23, 40]> : vector<4xindex> + } + ], + OutputName = "Add_75", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = 
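+ // Annotation: Mul_73 above applies the squeeze-and-excite gate channel-wise to the
+ // depthwise output (%195), finishing the attention path of this bottleneck block.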
"InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 23, 40]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg9 = %arg5: tensor<1x120x23x40xbf16>, %arg10 = %arg6: tensor<40x120x1x1xbf16>, %arg11 = %arg7: tensor<40xbf16>) attributes { + Dilations = array, + HWPadding = [[0, 0], [0, 0]], + LayerName = "Conv_74", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 23, 40]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "wts_data", + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[40, 120, 1, 1]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Conv_74", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 23, 40]> : vector<4xindex> + } + ], + Specializes = "Conv2DBf16", + Traits = { + AllowDMAOptimization = true + }, + With = { + config.AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 = 1 : ui8, + config.act = 0 : ui8, + config.act_type = "RELU", + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.conv_type = [0 : ui8, 12 : ui8, 64 : ui8], + config.dtype_ifm = "bfloat16", + config.dtype_ofm = "bfloat16", + config.dtype_wts = "bfloat16", + config.ksize.height = 1 : ui8, + config.ksize.width = 1 : ui8, + config.lrelu_alpha = 1.000000e+00 : bf16, + config.lrelu_alpha_kernel = 1.000000e+00 : bf16, + config.stride_h = 1 : ui8, + config.stride_w = 1 : ui8 + }} { + %463 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %464 = "tosa.const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc58) + %465 = tosa.reshape %arg10 {new_shape = array} : (tensor<40x120x1x1xbf16>) -> tensor<40x1x1x120xbf16> loc(#loc58) + %466 = tosa.transpose %arg9, %464 : (tensor<1x120x23x40xbf16>, tensor<4xi32>) -> tensor<1x23x40x120xbf16> loc(#loc58) + %467 = tosa.conv2d %466, %465, %arg11 { + PartOfLayerName = "Conv_74", + PartOfOutputName = "Conv_74", + dilation = array, + pad = array, + stride = array} : (tensor<1x23x40x120xbf16>, tensor<40x1x1x120xbf16>, tensor<40xbf16>) -> tensor<1x23x40x40xbf16> loc(#loc58) + %468 = tosa.transpose %467, %463 : (tensor<1x23x40x40xbf16>, tensor<4xi32>) -> tensor<1x40x23x40xbf16> loc(#loc58) + xten_nn.output %468 : tensor<1x40x23x40xbf16> loc(#loc58) + } -> tensor<1x40x23x40xbf16> loc(#loc58) + %462 = xten_nn.subgraph (%arg9 = %461: tensor<1x40x23x40xbf16>, %arg10 = %arg8: tensor<1x40x23x40xbf16>) attributes { + LayerName = "Add_75", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 23, 40]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<0> : 
vector<4xindex>, + l3_tile_count = dense<[1, 40, 23, 40]> : vector<4xindex> + } + ], + OutputName = "Add_75", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 23, 40]> : vector<4xindex> + } + ], + Specializes = "AddBf16", + Traits = { + Binary = true, + Elementwise = true + }, + With = { + config.act = 0 : ui8, + config.act_type = "LINEAR", + config.aie_arch = "aie2p", + config.compiler = "chess", + config.dtype = "bfloat16", + config.num_kernel_iters = 0 : ui16 + }} { + %463 = tosa.add %arg9, %arg10 {LayerName = "Add_75", OutputName = "Add_75"} : (tensor<1x40x23x40xbf16>, tensor<1x40x23x40xbf16>) -> tensor<1x40x23x40xbf16> loc(#loc59) + xten_nn.output %463 : tensor<1x40x23x40xbf16> loc(#loc59) + } -> tensor<1x40x23x40xbf16> loc(#loc59) + xten_nn.output %462 : tensor<1x40x23x40xbf16> loc(#loc59) + } -> tensor<1x40x23x40xbf16> loc(#loc325) + %206 = xten_nn.subgraph (%arg5 = %205: tensor<1x40x23x40xbf16>, %arg6 = %121: tensor<120x40x1x1xbf16>, %arg7 = %120: tensor<120xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Conv_76", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 23, 40]> : vector<4xindex> + }, + { + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[120, 40, 1, 1]> : vector<4xindex> + }, + { + UnknownDataFormat = true + } + ], + OutputName = "Relu_77", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 23, 40]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg8 = %arg5: tensor<1x40x23x40xbf16>, %arg9 = %arg6: tensor<120x40x1x1xbf16>, %arg10 = %arg7: tensor<120xbf16>) attributes { + Dilations = array, + HWPadding = [[0, 0], [0, 0]], + LayerName = "Conv_76", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 23, 40]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "wts_data", + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[120, 40, 1, 1]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Relu_77", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 23, 40]> : vector<4xindex> + } + ], + Specializes = "Conv2DBf16", + Traits = { + AllowDMAOptimization = true, + NonNegativeOut = true + }, + With = { + config.AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 = 1 : ui8, + config.act = 1 : ui8, + config.act_type = "RELU", + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.conv_type = [0 : ui8, 12 : ui8, 64 : ui8], + config.dtype_ifm = "bfloat16", + config.dtype_ofm = "bfloat16", + 
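+ // Annotation: subgraph %205 above chains Conv_74 (linear 1x1 projection, 120 -> 40) with
+ // the residual Add_75 in a single InCoreChain; OfmShare = 3 presumably lets the OFM reuse
+ // the buffer of operand 3, the residual input %193.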
config.dtype_wts = "bfloat16", + config.ksize.height = 1 : ui8, + config.ksize.width = 1 : ui8, + config.lrelu_alpha = 0.000000e+00 : bf16, + config.lrelu_alpha_kernel = 0.000000e+00 : bf16, + config.stride_h = 1 : ui8, + config.stride_w = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %463 = "tosa.const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc326) + %464 = tosa.reshape %arg9 {new_shape = array} : (tensor<120x40x1x1xbf16>) -> tensor<120x1x1x40xbf16> loc(#loc326) + %465 = tosa.transpose %arg8, %463 : (tensor<1x40x23x40xbf16>, tensor<4xi32>) -> tensor<1x23x40x40xbf16> loc(#loc326) + %466 = tosa.conv2d %465, %464, %arg10 { + PartOfLayerName = "Conv_76", + PartOfOutputName = "Conv_76", + dilation = array, + pad = array, + stride = array} : (tensor<1x23x40x40xbf16>, tensor<120x1x1x40xbf16>, tensor<120xbf16>) -> tensor<1x23x40x120xbf16> loc(#loc60) + %467 = tosa.clamp %466 { + LayerName = "Relu_77", + OutputName = "Relu_77", + max_fp = 3.40282347E+38 : f32, + max_int = 2147483647 : i64, + min_fp = 0.000000e+00 : f32, + min_int = 0 : i64} : (tensor<1x23x40x120xbf16>) -> tensor<1x23x40x120xbf16> loc(#loc61) + %468 = tosa.transpose %467, %462 : (tensor<1x23x40x120xbf16>, tensor<4xi32>) -> tensor<1x120x23x40xbf16> loc(#loc326) + xten_nn.output %468 : tensor<1x120x23x40xbf16> loc(#loc61) + } -> tensor<1x120x23x40xbf16> loc(#loc326) + xten_nn.output %461 : tensor<1x120x23x40xbf16> loc(#loc326) + } -> tensor<1x120x23x40xbf16> loc(#loc326) + %207 = xten_nn.subgraph (%arg5 = %206: tensor<1x120x23x40xbf16>, %arg6 = %119: tensor<120x1x5x5xbf16>, %arg7 = %118: tensor<120xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Conv_78", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 23, 40]> : vector<4xindex> + }, + { + CurrentDataFormat = "CMHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[120, 1, 5, 5]> : vector<4xindex> + }, + { + UnknownDataFormat = true + } + ], + OutputName = "Relu_79", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 23, 40]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg8 = %arg5: tensor<1x120x23x40xbf16>, %arg9 = %arg6: tensor<120x1x5x5xbf16>, %arg10 = %arg7: tensor<120xbf16>) attributes { + Dilations = array, + HWPadding = [[2, 2], [2, 2]], + LayerName = "Conv_78", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 23, 40]> : vector<4xindex> + }, + { + CurrentDataFormat = "CMHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.wts", + SubPort = "wts_data", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[120, 1, 5, 5]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Relu_79", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat 
= "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 23, 40]> : vector<4xindex> + } + ], + Specializes = "DepthwiseConv2dBf16", + Traits = { + NonNegativeOut = true + }, + With = { + config.act = 1 : ui8, + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.kernel_height = 5 : ui8, + config.kernel_width = 5 : ui8, + config.stride = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %463 = "tosa.const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %464 = "tosa.const"() <{value = dense<[2, 3, 0, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc327) + %465 = tosa.transpose %arg9, %464 : (tensor<120x1x5x5xbf16>, tensor<4xi32>) -> tensor<5x5x120x1xbf16> loc(#loc327) + %466 = tosa.transpose %arg8, %463 : (tensor<1x120x23x40xbf16>, tensor<4xi32>) -> tensor<1x23x40x120xbf16> loc(#loc327) + %467 = tosa.depthwise_conv2d %466, %465, %arg10 { + PartOfLayerName = "Conv_78", + PartOfOutputName = "Conv_78", + dilation = array, + pad = array, + stride = array} : (tensor<1x23x40x120xbf16>, tensor<5x5x120x1xbf16>, tensor<120xbf16>) -> tensor<1x23x40x120xbf16> loc(#loc62) + %468 = tosa.clamp %467 { + LayerName = "Relu_79", + OutputName = "Relu_79", + max_fp = 3.40282347E+38 : f32, + max_int = 2147483647 : i64, + min_fp = 0.000000e+00 : f32, + min_int = 0 : i64} : (tensor<1x23x40x120xbf16>) -> tensor<1x23x40x120xbf16> loc(#loc63) + %469 = tosa.transpose %468, %462 : (tensor<1x23x40x120xbf16>, tensor<4xi32>) -> tensor<1x120x23x40xbf16> loc(#loc327) + xten_nn.output %469 : tensor<1x120x23x40xbf16> loc(#loc63) + } -> tensor<1x120x23x40xbf16> loc(#loc327) + xten_nn.output %461 : tensor<1x120x23x40xbf16> loc(#loc327) + } -> tensor<1x120x23x40xbf16> loc(#loc327) + %208 = xten_nn.subgraph (%arg5 = %207: tensor<1x120x23x40xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Generated-#18", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 23, 40]> : vector<4xindex> + } + ], + OutputName = "Generated-#19", + Overlay = "1x1_1x1_unspecifiedConnectivity", + Reason = "TemplatedGraph", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 1, 920]> : vector<4xindex> + } + ], + Specializes = "Transpose4dAdf", + With = { + config.aie_arch = "aie2p", + config.dim_0 = 23 : ui32, + config.dim_1 = 15 : ui32, + config.dim_2 = 40 : ui32, + config.dim_3 = 8 : ui32, + config.dtype = "bfloat16", + config.perm = 6 : ui32 + }} { + %461 = tosa.reshape %arg5 {new_shape = array} : (tensor<1x120x23x40xbf16>) -> tensor<1x120x1x920xbf16> loc(#loc64) + xten_nn.output %461 : tensor<1x120x1x920xbf16> loc(#loc64) + } -> tensor<1x120x1x920xbf16> loc(#loc64) + %209 = xten_nn.subgraph (%arg5 = %208: tensor<1x120x1x920xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Generated-#20", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = 
dense<[1, 120, 1, 920]> : vector<4xindex> + } + ], + OutputName = "Generated-#21", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 1, 1]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x120x1x920xbf16>) attributes { + LayerName = "Generated-#20", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 1, 920]> : vector<4xindex> + } + ], + OutputName = "Generated-#21", + PadValue = 0.000000e+00 : bf16, + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 1, 1]> : vector<4xindex> + } + ], + Specializes = "ReduceMeanC8Bf16", + Traits = { + Reduce = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.full_channel = 120 : ui32, + config.full_height = 1 : ui32, + config.full_width = 920 : ui32, + config.reduce_dim = "W" + }} { + %462 = xten_nn.reduce_mean %arg6 {axes = array, keepdims = 1 : i64} : (tensor<1x120x1x920xbf16>) -> tensor<1x120x1x1xbf16> loc(#loc64) + xten_nn.output %462 : tensor<1x120x1x1xbf16> loc(#loc64) + } -> tensor<1x120x1x1xbf16> loc(#loc64) + xten_nn.output %461 : tensor<1x120x1x1xbf16> loc(#loc64) + } -> tensor<1x120x1x1xbf16> loc(#loc64) + %210 = xten_nn.subgraph (%arg5 = %209: tensor<1x120x1x1xbf16>, %arg6 = %117: tensor<32x120x1x1xbf16>, %arg7 = %116: tensor<32xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Conv_81", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 1, 1]> : vector<4xindex> + }, + { + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[32, 120, 1, 1]> : vector<4xindex> + }, + { + UnknownDataFormat = true + } + ], + OutputName = "Relu_82", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 32, 1, 1]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg8 = %arg5: tensor<1x120x1x1xbf16>, %arg9 = %arg6: tensor<32x120x1x1xbf16>, %arg10 = %arg7: tensor<32xbf16>) attributes { + Dilations = array, + HWPadding = [[0, 0], [0, 0]], + LayerName = "Conv_81", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 1, 1]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "wts_data", + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[32, 120, 1, 1]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + 
UnknownDataFormat = true + } + ], + OutputName = "Relu_82", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 32, 1, 1]> : vector<4xindex> + } + ], + Specializes = "Conv2DBf16", + Traits = { + AllowDMAOptimization = true, + NonNegativeOut = true + }, + With = { + config.AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 = 1 : ui8, + config.act = 1 : ui8, + config.act_type = "RELU", + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.conv_type = [0 : ui8, 12 : ui8, 64 : ui8], + config.dtype_ifm = "bfloat16", + config.dtype_ofm = "bfloat16", + config.dtype_wts = "bfloat16", + config.ksize.height = 1 : ui8, + config.ksize.width = 1 : ui8, + config.lrelu_alpha = 0.000000e+00 : bf16, + config.lrelu_alpha_kernel = 0.000000e+00 : bf16, + config.stride_h = 1 : ui8, + config.stride_w = 1 : ui8 + }} { + %462 = tosa.reshape %arg9 {new_shape = array} : (tensor<32x120x1x1xbf16>) -> tensor<32x1x1x120xbf16> loc(#loc328) + %463 = tosa.reshape %arg8 {new_shape = array} : (tensor<1x120x1x1xbf16>) -> tensor<1x1x1x120xbf16> loc(#loc328) + %464 = tosa.conv2d %463, %462, %arg10 { + PartOfLayerName = "Conv_81", + PartOfOutputName = "Conv_81", + dilation = array, + pad = array, + stride = array} : (tensor<1x1x1x120xbf16>, tensor<32x1x1x120xbf16>, tensor<32xbf16>) -> tensor<1x1x1x32xbf16> loc(#loc65) + %465 = tosa.clamp %464 { + LayerName = "Relu_82", + OutputName = "Relu_82", + max_fp = 3.40282347E+38 : f32, + max_int = 2147483647 : i64, + min_fp = 0.000000e+00 : f32, + min_int = 0 : i64} : (tensor<1x1x1x32xbf16>) -> tensor<1x1x1x32xbf16> loc(#loc66) + %466 = tosa.reshape %465 {new_shape = array} : (tensor<1x1x1x32xbf16>) -> tensor<1x32x1x1xbf16> loc(#loc328) + xten_nn.output %466 : tensor<1x32x1x1xbf16> loc(#loc66) + } -> tensor<1x32x1x1xbf16> loc(#loc328) + xten_nn.output %461 : tensor<1x32x1x1xbf16> loc(#loc328) + } -> tensor<1x32x1x1xbf16> loc(#loc328) + %211 = xten_nn.subgraph (%arg5 = %210: tensor<1x32x1x1xbf16>, %arg6 = %115: tensor<120x32x1x1xbf16>, %arg7 = %114: tensor<120xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Conv_83", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 32, 1, 1]> : vector<4xindex> + }, + { + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[120, 32, 1, 1]> : vector<4xindex> + }, + { + UnknownDataFormat = true + } + ], + OutputName = "Conv_83", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 1, 1]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg8 = %arg5: tensor<1x32x1x1xbf16>, %arg9 = %arg6: tensor<120x32x1x1xbf16>, %arg10 = %arg7: tensor<120xbf16>) attributes { + Dilations = array, + HWPadding = [[0, 0], [0, 0]], + LayerName = "Conv_83", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 32, 
1, 1]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "wts_data", + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[120, 32, 1, 1]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Conv_83", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 1, 1]> : vector<4xindex> + } + ], + Specializes = "Conv2DBf16", + Traits = { + AllowDMAOptimization = true + }, + With = { + config.AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 = 1 : ui8, + config.act = 0 : ui8, + config.act_type = "RELU", + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.conv_type = [0 : ui8, 12 : ui8, 64 : ui8], + config.dtype_ifm = "bfloat16", + config.dtype_ofm = "bfloat16", + config.dtype_wts = "bfloat16", + config.ksize.height = 1 : ui8, + config.ksize.width = 1 : ui8, + config.lrelu_alpha = 1.000000e+00 : bf16, + config.lrelu_alpha_kernel = 1.000000e+00 : bf16, + config.stride_h = 1 : ui8, + config.stride_w = 1 : ui8 + }} { + %462 = tosa.reshape %arg9 {new_shape = array} : (tensor<120x32x1x1xbf16>) -> tensor<120x1x1x32xbf16> loc(#loc67) + %463 = tosa.reshape %arg8 {new_shape = array} : (tensor<1x32x1x1xbf16>) -> tensor<1x1x1x32xbf16> loc(#loc67) + %464 = tosa.conv2d %463, %462, %arg10 { + PartOfLayerName = "Conv_83", + PartOfOutputName = "Conv_83", + dilation = array, + pad = array, + stride = array} : (tensor<1x1x1x32xbf16>, tensor<120x1x1x32xbf16>, tensor<120xbf16>) -> tensor<1x1x1x120xbf16> loc(#loc67) + %465 = tosa.reshape %464 {new_shape = array} : (tensor<1x1x1x120xbf16>) -> tensor<1x120x1x1xbf16> loc(#loc67) + xten_nn.output %465 : tensor<1x120x1x1xbf16> loc(#loc67) + } -> tensor<1x120x1x1xbf16> loc(#loc67) + xten_nn.output %461 : tensor<1x120x1x1xbf16> loc(#loc67) + } -> tensor<1x120x1x1xbf16> loc(#loc67) + %212 = xten_nn.subgraph (%arg5 = %211: tensor<1x120x1x1xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Add_85", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 1, 1]> : vector<4xindex> + } + ], + OutputName = "Add_85", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 1, 1]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x120x1x1xbf16>) attributes { + LayerName = "Add_85", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 1, 1]> : vector<4xindex> + } + ], + OutputName = "Add_85", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 1, 1]> : vector<4xindex> + } + ], + Specializes = "AddAttributeBroadcastingBf16", + Traits = { + 
Elementwise = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.num_kernel_iters = 0 : ui16, + config.scalar = 3.000000e+00 : bf16, + config.scalar_position = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<3.000000e+00> : tensor<1x1x1x1xbf16>}> : () -> tensor<1x1x1x1xbf16> loc(#loc) + %463 = tosa.add %arg6, %462 {LayerName = "Add_85", OutputName = "Add_85"} : (tensor<1x120x1x1xbf16>, tensor<1x1x1x1xbf16>) -> tensor<1x120x1x1xbf16> loc(#loc68) + xten_nn.output %463 : tensor<1x120x1x1xbf16> loc(#loc68) + } -> tensor<1x120x1x1xbf16> loc(#loc68) + xten_nn.output %461 : tensor<1x120x1x1xbf16> loc(#loc68) + } -> tensor<1x120x1x1xbf16> loc(#loc68) + %213 = xten_nn.subgraph (%arg5 = %212: tensor<1x120x1x1xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Clip_88", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 1, 1]> : vector<4xindex> + } + ], + OutputName = "Clip_88", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 1, 1]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x120x1x1xbf16>) attributes { + LayerName = "Clip_88", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 1, 1]> : vector<4xindex> + } + ], + OutputName = "Clip_88", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 1, 1]> : vector<4xindex> + } + ], + Specializes = "ClipBf16", + Traits = { + Elementwise = true, + NonNegativeOut = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.clamp_max = 6.000000e+00 : bf16, + config.clamp_min = 0.000000e+00 : bf16, + config.compiler = "chess", + config.ifm_shift = 0 : si8, + config.num_kernel_iters = 0 : ui16, + config.ofm_shift = 0 : si8 + }} { + %462 = tosa.clamp %arg6 { + LayerName = "Clip_88", + OutputName = "Clip_88", + max_fp = 6.000000e+00 : f32, + max_int = 6 : i64, + min_fp = 0.000000e+00 : f32, + min_int = 0 : i64} : (tensor<1x120x1x1xbf16>) -> tensor<1x120x1x1xbf16> loc(#loc69) + xten_nn.output %462 : tensor<1x120x1x1xbf16> loc(#loc69) + } -> tensor<1x120x1x1xbf16> loc(#loc69) + xten_nn.output %461 : tensor<1x120x1x1xbf16> loc(#loc69) + } -> tensor<1x120x1x1xbf16> loc(#loc69) + %214 = xten_nn.subgraph (%arg5 = %213: tensor<1x120x1x1xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Div_90", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 1, 1]> : vector<4xindex> + } + ], + OutputName = "Div_90", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : 
vector<4xindex>, + l3_tile_count = dense<[1, 120, 1, 1]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x120x1x1xbf16>) attributes { + LayerName = "Div_90", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 1, 1]> : vector<4xindex> + } + ], + OutputName = "Div_90", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 1, 1]> : vector<4xindex> + } + ], + Specializes = "MulAttributeBroadcastingBf16", + Traits = { + Elementwise = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.num_kernel_iters = 0 : ui16, + config.scalar = 1.660160e-01 : bf16, + config.scalar_position = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<1.660160e-01> : tensor<1x1x1x1xbf16>}> : () -> tensor<1x1x1x1xbf16> loc(#loc) + %463 = tosa.mul %arg6, %462 { + LayerName = "Div_90", + OutputName = "Div_90", + shift = 0 : i8} : (tensor<1x120x1x1xbf16>, tensor<1x1x1x1xbf16>) -> tensor<1x120x1x1xbf16> loc(#loc70) + xten_nn.output %463 : tensor<1x120x1x1xbf16> loc(#loc70) + } -> tensor<1x120x1x1xbf16> loc(#loc70) + xten_nn.output %461 : tensor<1x120x1x1xbf16> loc(#loc70) + } -> tensor<1x120x1x1xbf16> loc(#loc70) + %215 = xten_nn.subgraph (%arg5 = %214: tensor<1x120x1x1xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Generated-#22", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 1, 1]> : vector<4xindex> + } + ], + OutputName = "Generated-#23", + Overlay = "1x1_1x1_unspecifiedConnectivity", + Reason = "TemplatedGraph", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 23, 40]> : vector<4xindex> + } + ], + Specializes = "TileAdf", + With = { + config.aie_arch = "aie2p", + config.dtype = "bfloat16", + config.i_dim_c = 120 : ui32, + config.i_dim_h = 1 : ui32, + config.i_dim_n = 1 : ui32, + config.i_dim_w = 1 : ui32, + config.rep_dim_c = 1 : ui32, + config.rep_dim_h = 23 : ui32, + config.rep_dim_w = 40 : ui32 + }} { + %461 = tosa.tile %arg5 {multiples = array} : (tensor<1x120x1x1xbf16>) -> tensor<1x120x23x40xbf16> loc(#loc71) + xten_nn.output %461 : tensor<1x120x23x40xbf16> loc(#loc71) + } -> tensor<1x120x23x40xbf16> loc(#loc71) + %216 = xten_nn.subgraph (%arg5 = %215: tensor<1x120x23x40xbf16>, %arg6 = %207: tensor<1x120x23x40xbf16>) attributes { + IfmOperands = [0 : index, 1 : index], + LayerName = "Mul_91", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 23, 40]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = 
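+ // Annotation: Add_85, Clip_88 and Div_90 plus the TileAdf broadcast above mirror the
+ // first block's hard-sigmoid gate exactly, down to the same bf16 constants; only the
+ // squeeze-and-excite weight operands differ.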
dense<[1, 120, 23, 40]> : vector<4xindex> + } + ], + OutputName = "Mul_91", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 23, 40]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg7 = %arg5: tensor<1x120x23x40xbf16>, %arg8 = %arg6: tensor<1x120x23x40xbf16>) attributes { + LayerName = "Mul_91", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 23, 40]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 23, 40]> : vector<4xindex> + } + ], + OutputName = "Mul_91", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 23, 40]> : vector<4xindex> + } + ], + Specializes = "MulBf16", + Traits = { + Binary = true, + Elementwise = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.dtype = "bfloat16", + config.num_kernel_iters = 0 : ui16 + }} { + %462 = tosa.mul %arg7, %arg8 { + LayerName = "Mul_91", + OutputName = "Mul_91", + shift = 0 : i8} : (tensor<1x120x23x40xbf16>, tensor<1x120x23x40xbf16>) -> tensor<1x120x23x40xbf16> loc(#loc71) + xten_nn.output %462 : tensor<1x120x23x40xbf16> loc(#loc71) + } -> tensor<1x120x23x40xbf16> loc(#loc71) + xten_nn.output %461 : tensor<1x120x23x40xbf16> loc(#loc71) + } -> tensor<1x120x23x40xbf16> loc(#loc71) + %217 = xten_nn.subgraph (%arg5 = %216: tensor<1x120x23x40xbf16>, %arg6 = %113: tensor<40x120x1x1xbf16>, %arg7 = %112: tensor<40xbf16>, %arg8 = %205: tensor<1x40x23x40xbf16>) attributes { + IfmOperands = [0 : index, 3 : index], + LayerName = "Conv_92", + OfmShare = 3 : index, + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 23, 40]> : vector<4xindex> + }, + { + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[40, 120, 1, 1]> : vector<4xindex> + }, + { + UnknownDataFormat = true + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 23, 40]> : vector<4xindex> + } + ], + OutputName = "Add_93", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 23, 40]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg9 = %arg5: tensor<1x120x23x40xbf16>, %arg10 = %arg6: tensor<40x120x1x1xbf16>, %arg11 = %arg7: tensor<40xbf16>) attributes { + 
Dilations = array<i64: 1, 1>, + HWPadding = [[0, 0], [0, 0]], + LayerName = "Conv_92", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 23, 40]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "wts_data", + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[40, 120, 1, 1]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Conv_92", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 23, 40]> : vector<4xindex> + } + ], + Specializes = "Conv2DBf16", + Traits = { + AllowDMAOptimization = true + }, + With = { + config.AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 = 1 : ui8, + config.act = 0 : ui8, + config.act_type = "RELU", + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.conv_type = [0 : ui8, 12 : ui8, 64 : ui8], + config.dtype_ifm = "bfloat16", + config.dtype_ofm = "bfloat16", + config.dtype_wts = "bfloat16", + config.ksize.height = 1 : ui8, + config.ksize.width = 1 : ui8, + config.lrelu_alpha = 1.000000e+00 : bf16, + config.lrelu_alpha_kernel = 1.000000e+00 : bf16, + config.stride_h = 1 : ui8, + config.stride_w = 1 : ui8 + }} { + %463 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %464 = "tosa.const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc72) + %465 = tosa.reshape %arg10 {new_shape = array<i64: 40, 1, 1, 120>} : (tensor<40x120x1x1xbf16>) -> tensor<40x1x1x120xbf16> loc(#loc72) + %466 = tosa.transpose %arg9, %464 : (tensor<1x120x23x40xbf16>, tensor<4xi32>) -> tensor<1x23x40x120xbf16> loc(#loc72) + %467 = tosa.conv2d %466, %465, %arg11 { + PartOfLayerName = "Conv_92", + PartOfOutputName = "Conv_92", + dilation = array<i64: 1, 1>, + pad = array<i64: 0, 0, 0, 0>, + stride = array<i64: 1, 1>} : (tensor<1x23x40x120xbf16>, tensor<40x1x1x120xbf16>, tensor<40xbf16>) -> tensor<1x23x40x40xbf16> loc(#loc72) + %468 = tosa.transpose %467, %463 : (tensor<1x23x40x40xbf16>, tensor<4xi32>) -> tensor<1x40x23x40xbf16> loc(#loc72) + xten_nn.output %468 : tensor<1x40x23x40xbf16> loc(#loc72) + } -> tensor<1x40x23x40xbf16> loc(#loc72) + %462 = xten_nn.subgraph (%arg9 = %461: tensor<1x40x23x40xbf16>, %arg10 = %arg8: tensor<1x40x23x40xbf16>) attributes { + LayerName = "Add_93", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 23, 40]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 23, 40]> : vector<4xindex> + } + ], + OutputName = "Add_93", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 23, 40]> : vector<4xindex> + } + ], + Specializes = "AddBf16", + Traits = { + Binary = true, + Elementwise = true + }, + With = { + config.act = 0 : ui8, + config.act_type =
"LINEAR", + config.aie_arch = "aie2p", + config.compiler = "chess", + config.dtype = "bfloat16", + config.num_kernel_iters = 0 : ui16 + }} { + %463 = tosa.add %arg9, %arg10 {LayerName = "Add_93", OutputName = "Add_93"} : (tensor<1x40x23x40xbf16>, tensor<1x40x23x40xbf16>) -> tensor<1x40x23x40xbf16> loc(#loc73) + xten_nn.output %463 : tensor<1x40x23x40xbf16> loc(#loc73) + } -> tensor<1x40x23x40xbf16> loc(#loc73) + xten_nn.output %462 : tensor<1x40x23x40xbf16> loc(#loc73) + } -> tensor<1x40x23x40xbf16> loc(#loc329) + %218 = xten_nn.subgraph (%arg5 = %217: tensor<1x40x23x40xbf16>, %arg6 = %111: tensor<240x40x1x1xbf16>, %arg7 = %110: tensor<240xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Conv_94", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 23, 40]> : vector<4xindex> + }, + { + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[240, 40, 1, 1]> : vector<4xindex> + }, + { + UnknownDataFormat = true + } + ], + OutputName = "Conv_94", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 240, 23, 40]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg8 = %arg5: tensor<1x40x23x40xbf16>, %arg9 = %arg6: tensor<240x40x1x1xbf16>, %arg10 = %arg7: tensor<240xbf16>) attributes { + Dilations = array<i64: 1, 1>, + HWPadding = [[0, 0], [0, 0]], + LayerName = "Conv_94", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 23, 40]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "wts_data", + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[240, 40, 1, 1]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Conv_94", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 240, 23, 40]> : vector<4xindex> + } + ], + Specializes = "Conv2DBf16", + Traits = { + AllowDMAOptimization = true + }, + With = { + config.AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 = 1 : ui8, + config.act = 0 : ui8, + config.act_type = "RELU", + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.conv_type = [0 : ui8, 12 : ui8, 64 : ui8], + config.dtype_ifm = "bfloat16", + config.dtype_ofm = "bfloat16", + config.dtype_wts = "bfloat16", + config.ksize.height = 1 : ui8, + config.ksize.width = 1 : ui8, + config.lrelu_alpha = 1.000000e+00 : bf16, + config.lrelu_alpha_kernel = 1.000000e+00 : bf16, + config.stride_h = 1 : ui8, + config.stride_w = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %463 = "tosa.const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc74) + %464 = tosa.reshape %arg9 {new_shape = array<i64: 240, 1, 1, 40>} : (tensor<240x40x1x1xbf16>)
-> tensor<240x1x1x40xbf16> loc(#loc74) + %465 = tosa.transpose %arg8, %463 : (tensor<1x40x23x40xbf16>, tensor<4xi32>) -> tensor<1x23x40x40xbf16> loc(#loc74) + %466 = tosa.conv2d %465, %464, %arg10 { + PartOfLayerName = "Conv_94", + PartOfOutputName = "Conv_94", + dilation = array<i64: 1, 1>, + pad = array<i64: 0, 0, 0, 0>, + stride = array<i64: 1, 1>} : (tensor<1x23x40x40xbf16>, tensor<240x1x1x40xbf16>, tensor<240xbf16>) -> tensor<1x23x40x240xbf16> loc(#loc74) + %467 = tosa.transpose %466, %462 : (tensor<1x23x40x240xbf16>, tensor<4xi32>) -> tensor<1x240x23x40xbf16> loc(#loc74) + xten_nn.output %467 : tensor<1x240x23x40xbf16> loc(#loc74) + } -> tensor<1x240x23x40xbf16> loc(#loc74) + xten_nn.output %461 : tensor<1x240x23x40xbf16> loc(#loc74) + } -> tensor<1x240x23x40xbf16> loc(#loc74) + %219 = xten_nn.subgraph (%arg5 = %218: tensor<1x240x23x40xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Add_96", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 240, 23, 40]> : vector<4xindex> + } + ], + OutputName = "Add_96", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 240, 23, 40]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x240x23x40xbf16>) attributes { + LayerName = "Add_96", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 240, 23, 40]> : vector<4xindex> + } + ], + OutputName = "Add_96", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 240, 23, 40]> : vector<4xindex> + } + ], + Specializes = "AddAttributeBroadcastingBf16", + Traits = { + Elementwise = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.num_kernel_iters = 0 : ui16, + config.scalar = 3.000000e+00 : bf16, + config.scalar_position = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<3.000000e+00> : tensor<1x1x1x1xbf16>}> : () -> tensor<1x1x1x1xbf16> loc(#loc) + %463 = tosa.add %arg6, %462 {LayerName = "Add_96", OutputName = "Add_96"} : (tensor<1x240x23x40xbf16>, tensor<1x1x1x1xbf16>) -> tensor<1x240x23x40xbf16> loc(#loc75) + xten_nn.output %463 : tensor<1x240x23x40xbf16> loc(#loc75) + } -> tensor<1x240x23x40xbf16> loc(#loc75) + xten_nn.output %461 : tensor<1x240x23x40xbf16> loc(#loc75) + } -> tensor<1x240x23x40xbf16> loc(#loc75) + %220 = xten_nn.subgraph (%arg5 = %219: tensor<1x240x23x40xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Clip_99", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 240, 23, 40]> : vector<4xindex> + } + ], + OutputName = "Clip_99", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", +
l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 240, 23, 40]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x240x23x40xbf16>) attributes { + LayerName = "Clip_99", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 240, 23, 40]> : vector<4xindex> + } + ], + OutputName = "Clip_99", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 240, 23, 40]> : vector<4xindex> + } + ], + Specializes = "ClipBf16", + Traits = { + Elementwise = true, + NonNegativeOut = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.clamp_max = 6.000000e+00 : bf16, + config.clamp_min = 0.000000e+00 : bf16, + config.compiler = "chess", + config.ifm_shift = 0 : si8, + config.num_kernel_iters = 0 : ui16, + config.ofm_shift = 0 : si8 + }} { + %462 = tosa.clamp %arg6 { + LayerName = "Clip_99", + OutputName = "Clip_99", + max_fp = 6.000000e+00 : f32, + max_int = 6 : i64, + min_fp = 0.000000e+00 : f32, + min_int = 0 : i64} : (tensor<1x240x23x40xbf16>) -> tensor<1x240x23x40xbf16> loc(#loc76) + xten_nn.output %462 : tensor<1x240x23x40xbf16> loc(#loc76) + } -> tensor<1x240x23x40xbf16> loc(#loc76) + xten_nn.output %461 : tensor<1x240x23x40xbf16> loc(#loc76) + } -> tensor<1x240x23x40xbf16> loc(#loc76) + %221 = xten_nn.subgraph (%arg5 = %220: tensor<1x240x23x40xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Div_101", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 240, 23, 40]> : vector<4xindex> + } + ], + OutputName = "Div_101", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 240, 23, 40]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x240x23x40xbf16>) attributes { + LayerName = "Div_101", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 240, 23, 40]> : vector<4xindex> + } + ], + OutputName = "Div_101", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 240, 23, 40]> : vector<4xindex> + } + ], + Specializes = "MulAttributeBroadcastingBf16", + Traits = { + Elementwise = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.num_kernel_iters = 0 : ui16, + config.scalar = 1.660160e-01 : bf16, + config.scalar_position = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<1.660160e-01> : tensor<1x1x1x1xbf16>}> : () -> tensor<1x1x1x1xbf16> loc(#loc) + 
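+ // The dense<1.660160e-01> constant above is 1/6 rounded to bf16. Add_96 (x + 3), Clip_99 (clamp to [0, 6]), and this Div_101 multiply together compute hard-sigmoid(x) = clip(x + 3, 0, 6) / 6; the following Mul_102 against the Conv_94 output completes the hard-swish activation.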
%463 = tosa.mul %arg6, %462 { + LayerName = "Div_101", + OutputName = "Div_101", + shift = 0 : i8} : (tensor<1x240x23x40xbf16>, tensor<1x1x1x1xbf16>) -> tensor<1x240x23x40xbf16> loc(#loc77) + xten_nn.output %463 : tensor<1x240x23x40xbf16> loc(#loc77) + } -> tensor<1x240x23x40xbf16> loc(#loc77) + xten_nn.output %461 : tensor<1x240x23x40xbf16> loc(#loc77) + } -> tensor<1x240x23x40xbf16> loc(#loc77) + %222 = xten_nn.subgraph (%arg5 = %218: tensor<1x240x23x40xbf16>, %arg6 = %221: tensor<1x240x23x40xbf16>) attributes { + IfmOperands = [0 : index, 1 : index], + LayerName = "Mul_102", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 240, 23, 40]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 240, 23, 40]> : vector<4xindex> + } + ], + OutputName = "Mul_102", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 240, 23, 40]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg7 = %arg5: tensor<1x240x23x40xbf16>, %arg8 = %arg6: tensor<1x240x23x40xbf16>) attributes { + LayerName = "Mul_102", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 240, 23, 40]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 240, 23, 40]> : vector<4xindex> + } + ], + OutputName = "Mul_102", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 240, 23, 40]> : vector<4xindex> + } + ], + Specializes = "MulBf16", + Traits = { + Binary = true, + Elementwise = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.dtype = "bfloat16", + config.num_kernel_iters = 0 : ui16 + }} { + %462 = tosa.mul %arg7, %arg8 { + LayerName = "Mul_102", + OutputName = "Mul_102", + shift = 0 : i8} : (tensor<1x240x23x40xbf16>, tensor<1x240x23x40xbf16>) -> tensor<1x240x23x40xbf16> loc(#loc78) + xten_nn.output %462 : tensor<1x240x23x40xbf16> loc(#loc78) + } -> tensor<1x240x23x40xbf16> loc(#loc78) + xten_nn.output %461 : tensor<1x240x23x40xbf16> loc(#loc78) + } -> tensor<1x240x23x40xbf16> loc(#loc78) + %223 = xten_nn.subgraph (%arg5 = %222: tensor<1x240x23x40xbf16>, %arg6 = %109: tensor<240x1x3x3xbf16>, %arg7 = %108: tensor<240xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Conv_103", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 240, 23, 40]> : vector<4xindex> + }, + { + CurrentDataFormat = "CMHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> 
: vector<4xindex>, + l3_tile_count = dense<[240, 1, 3, 3]> : vector<4xindex> + }, + { + UnknownDataFormat = true + } + ], + OutputName = "Conv_103", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 240, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg8 = %arg5: tensor<1x240x23x40xbf16>, %arg9 = %arg6: tensor<240x1x3x3xbf16>, %arg10 = %arg7: tensor<240xbf16>) attributes { + Dilations = array<i64: 1, 1>, + HWPadding = [[1, 1], [1, 0]], + LayerName = "Conv_103", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 240, 23, 40]> : vector<4xindex> + }, + { + CurrentDataFormat = "CMHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.wts", + SubPort = "wts_data", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[240, 1, 3, 3]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Conv_103", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 240, 12, 20]> : vector<4xindex> + } + ], + Specializes = "DepthwiseConv2dBf16", + With = { + config.act = 0 : ui8, + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.kernel_height = 3 : ui8, + config.kernel_width = 3 : ui8, + config.stride = 2 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %463 = "tosa.const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %464 = "tosa.const"() <{value = dense<[2, 3, 0, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc79) + %465 = tosa.transpose %arg9, %464 : (tensor<240x1x3x3xbf16>, tensor<4xi32>) -> tensor<3x3x240x1xbf16> loc(#loc79) + %466 = tosa.transpose %arg8, %463 : (tensor<1x240x23x40xbf16>, tensor<4xi32>) -> tensor<1x23x40x240xbf16> loc(#loc79) + %467 = tosa.depthwise_conv2d %466, %465, %arg10 { + PartOfLayerName = "Conv_103", + PartOfOutputName = "Conv_103", + dilation = array<i64: 1, 1>, + pad = array<i64: 1, 1, 1, 0>, + stride = array<i64: 2, 2>} : (tensor<1x23x40x240xbf16>, tensor<3x3x240x1xbf16>, tensor<240xbf16>) -> tensor<1x12x20x240xbf16> loc(#loc79) + %468 = tosa.transpose %467, %462 : (tensor<1x12x20x240xbf16>, tensor<4xi32>) -> tensor<1x240x12x20xbf16> loc(#loc79) + xten_nn.output %468 : tensor<1x240x12x20xbf16> loc(#loc79) + } -> tensor<1x240x12x20xbf16> loc(#loc79) + xten_nn.output %461 : tensor<1x240x12x20xbf16> loc(#loc79) + } -> tensor<1x240x12x20xbf16> loc(#loc79) + %224 = xten_nn.subgraph (%arg5 = %223: tensor<1x240x12x20xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Add_105", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 240, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Add_105", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", +
Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 240, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x240x12x20xbf16>) attributes { + LayerName = "Add_105", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 240, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Add_105", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 240, 12, 20]> : vector<4xindex> + } + ], + Specializes = "AddAttributeBroadcastingBf16", + Traits = { + Elementwise = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.num_kernel_iters = 0 : ui16, + config.scalar = 3.000000e+00 : bf16, + config.scalar_position = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<3.000000e+00> : tensor<1x1x1x1xbf16>}> : () -> tensor<1x1x1x1xbf16> loc(#loc) + %463 = tosa.add %arg6, %462 {LayerName = "Add_105", OutputName = "Add_105"} : (tensor<1x240x12x20xbf16>, tensor<1x1x1x1xbf16>) -> tensor<1x240x12x20xbf16> loc(#loc80) + xten_nn.output %463 : tensor<1x240x12x20xbf16> loc(#loc80) + } -> tensor<1x240x12x20xbf16> loc(#loc80) + xten_nn.output %461 : tensor<1x240x12x20xbf16> loc(#loc80) + } -> tensor<1x240x12x20xbf16> loc(#loc80) + %225 = xten_nn.subgraph (%arg5 = %224: tensor<1x240x12x20xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Clip_108", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 240, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Clip_108", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 240, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x240x12x20xbf16>) attributes { + LayerName = "Clip_108", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 240, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Clip_108", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 240, 12, 20]> : vector<4xindex> + } + ], + Specializes = "ClipBf16", + Traits = { + Elementwise = true, + NonNegativeOut = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.clamp_max = 6.000000e+00 : bf16, + config.clamp_min = 0.000000e+00 : bf16, + config.compiler = "chess", + config.ifm_shift = 0 : si8, + config.num_kernel_iters = 0 : ui16, + 
config.ofm_shift = 0 : si8 + }} { + %462 = tosa.clamp %arg6 { + LayerName = "Clip_108", + OutputName = "Clip_108", + max_fp = 6.000000e+00 : f32, + max_int = 6 : i64, + min_fp = 0.000000e+00 : f32, + min_int = 0 : i64} : (tensor<1x240x12x20xbf16>) -> tensor<1x240x12x20xbf16> loc(#loc81) + xten_nn.output %462 : tensor<1x240x12x20xbf16> loc(#loc81) + } -> tensor<1x240x12x20xbf16> loc(#loc81) + xten_nn.output %461 : tensor<1x240x12x20xbf16> loc(#loc81) + } -> tensor<1x240x12x20xbf16> loc(#loc81) + %226 = xten_nn.subgraph (%arg5 = %225: tensor<1x240x12x20xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Div_110", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 240, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Div_110", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 240, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x240x12x20xbf16>) attributes { + LayerName = "Div_110", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 240, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Div_110", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 240, 12, 20]> : vector<4xindex> + } + ], + Specializes = "MulAttributeBroadcastingBf16", + Traits = { + Elementwise = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.num_kernel_iters = 0 : ui16, + config.scalar = 1.660160e-01 : bf16, + config.scalar_position = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<1.660160e-01> : tensor<1x1x1x1xbf16>}> : () -> tensor<1x1x1x1xbf16> loc(#loc) + %463 = tosa.mul %arg6, %462 { + LayerName = "Div_110", + OutputName = "Div_110", + shift = 0 : i8} : (tensor<1x240x12x20xbf16>, tensor<1x1x1x1xbf16>) -> tensor<1x240x12x20xbf16> loc(#loc82) + xten_nn.output %463 : tensor<1x240x12x20xbf16> loc(#loc82) + } -> tensor<1x240x12x20xbf16> loc(#loc82) + xten_nn.output %461 : tensor<1x240x12x20xbf16> loc(#loc82) + } -> tensor<1x240x12x20xbf16> loc(#loc82) + %227 = xten_nn.subgraph (%arg5 = %223: tensor<1x240x12x20xbf16>, %arg6 = %226: tensor<1x240x12x20xbf16>) attributes { + IfmOperands = [0 : index, 1 : index], + LayerName = "Mul_111", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 240, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 240, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Mul_111", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = 
"C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 240, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg7 = %arg5: tensor<1x240x12x20xbf16>, %arg8 = %arg6: tensor<1x240x12x20xbf16>) attributes { + LayerName = "Mul_111", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 240, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 240, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Mul_111", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 240, 12, 20]> : vector<4xindex> + } + ], + Specializes = "MulBf16", + Traits = { + Binary = true, + Elementwise = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.dtype = "bfloat16", + config.num_kernel_iters = 0 : ui16 + }} { + %462 = tosa.mul %arg7, %arg8 { + LayerName = "Mul_111", + OutputName = "Mul_111", + shift = 0 : i8} : (tensor<1x240x12x20xbf16>, tensor<1x240x12x20xbf16>) -> tensor<1x240x12x20xbf16> loc(#loc83) + xten_nn.output %462 : tensor<1x240x12x20xbf16> loc(#loc83) + } -> tensor<1x240x12x20xbf16> loc(#loc83) + xten_nn.output %461 : tensor<1x240x12x20xbf16> loc(#loc83) + } -> tensor<1x240x12x20xbf16> loc(#loc83) + %228 = xten_nn.subgraph (%arg5 = %227: tensor<1x240x12x20xbf16>, %arg6 = %107: tensor<80x240x1x1xbf16>, %arg7 = %106: tensor<80xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Conv_112", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 240, 12, 20]> : vector<4xindex> + }, + { + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[80, 240, 1, 1]> : vector<4xindex> + }, + { + UnknownDataFormat = true + } + ], + OutputName = "Conv_112", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 80, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg8 = %arg5: tensor<1x240x12x20xbf16>, %arg9 = %arg6: tensor<80x240x1x1xbf16>, %arg10 = %arg7: tensor<80xbf16>) attributes { + Dilations = array, + HWPadding = [[0, 0], [0, 0]], + LayerName = "Conv_112", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 240, 12, 20]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "wts_data", + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[80, 240, 1, 1]> : vector<4xindex> + }, + { + Port = 
"data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Conv_112", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 80, 12, 20]> : vector<4xindex> + } + ], + Specializes = "Conv2DBf16", + Traits = { + AllowDMAOptimization = true + }, + With = { + config.AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 = 1 : ui8, + config.act = 0 : ui8, + config.act_type = "RELU", + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.conv_type = [0 : ui8, 12 : ui8, 64 : ui8], + config.dtype_ifm = "bfloat16", + config.dtype_ofm = "bfloat16", + config.dtype_wts = "bfloat16", + config.ksize.height = 1 : ui8, + config.ksize.width = 1 : ui8, + config.lrelu_alpha = 1.000000e+00 : bf16, + config.lrelu_alpha_kernel = 1.000000e+00 : bf16, + config.stride_h = 1 : ui8, + config.stride_w = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %463 = "tosa.const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc84) + %464 = tosa.reshape %arg9 {new_shape = array} : (tensor<80x240x1x1xbf16>) -> tensor<80x1x1x240xbf16> loc(#loc84) + %465 = tosa.transpose %arg8, %463 : (tensor<1x240x12x20xbf16>, tensor<4xi32>) -> tensor<1x12x20x240xbf16> loc(#loc84) + %466 = tosa.conv2d %465, %464, %arg10 { + PartOfLayerName = "Conv_112", + PartOfOutputName = "Conv_112", + dilation = array, + pad = array, + stride = array} : (tensor<1x12x20x240xbf16>, tensor<80x1x1x240xbf16>, tensor<80xbf16>) -> tensor<1x12x20x80xbf16> loc(#loc84) + %467 = tosa.transpose %466, %462 : (tensor<1x12x20x80xbf16>, tensor<4xi32>) -> tensor<1x80x12x20xbf16> loc(#loc84) + xten_nn.output %467 : tensor<1x80x12x20xbf16> loc(#loc84) + } -> tensor<1x80x12x20xbf16> loc(#loc84) + xten_nn.output %461 : tensor<1x80x12x20xbf16> loc(#loc84) + } -> tensor<1x80x12x20xbf16> loc(#loc84) + %229 = xten_nn.subgraph (%arg5 = %228: tensor<1x80x12x20xbf16>, %arg6 = %105: tensor<200x80x1x1xbf16>, %arg7 = %104: tensor<200xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Conv_113", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 80, 12, 20]> : vector<4xindex> + }, + { + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[200, 80, 1, 1]> : vector<4xindex> + }, + { + UnknownDataFormat = true + } + ], + OutputName = "Conv_113", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 200, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg8 = %arg5: tensor<1x80x12x20xbf16>, %arg9 = %arg6: tensor<200x80x1x1xbf16>, %arg10 = %arg7: tensor<200xbf16>) attributes { + Dilations = array, + HWPadding = [[0, 0], [0, 0]], + LayerName = "Conv_113", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 80, 
12, 20]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "wts_data", + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[200, 80, 1, 1]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Conv_113", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 200, 12, 20]> : vector<4xindex> + } + ], + Specializes = "Conv2DBf16", + Traits = { + AllowDMAOptimization = true + }, + With = { + config.AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 = 1 : ui8, + config.act = 0 : ui8, + config.act_type = "RELU", + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.conv_type = [0 : ui8, 12 : ui8, 64 : ui8], + config.dtype_ifm = "bfloat16", + config.dtype_ofm = "bfloat16", + config.dtype_wts = "bfloat16", + config.ksize.height = 1 : ui8, + config.ksize.width = 1 : ui8, + config.lrelu_alpha = 1.000000e+00 : bf16, + config.lrelu_alpha_kernel = 1.000000e+00 : bf16, + config.stride_h = 1 : ui8, + config.stride_w = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %463 = "tosa.const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc85) + %464 = tosa.reshape %arg9 {new_shape = array} : (tensor<200x80x1x1xbf16>) -> tensor<200x1x1x80xbf16> loc(#loc85) + %465 = tosa.transpose %arg8, %463 : (tensor<1x80x12x20xbf16>, tensor<4xi32>) -> tensor<1x12x20x80xbf16> loc(#loc85) + %466 = tosa.conv2d %465, %464, %arg10 { + PartOfLayerName = "Conv_113", + PartOfOutputName = "Conv_113", + dilation = array, + pad = array, + stride = array} : (tensor<1x12x20x80xbf16>, tensor<200x1x1x80xbf16>, tensor<200xbf16>) -> tensor<1x12x20x200xbf16> loc(#loc85) + %467 = tosa.transpose %466, %462 : (tensor<1x12x20x200xbf16>, tensor<4xi32>) -> tensor<1x200x12x20xbf16> loc(#loc85) + xten_nn.output %467 : tensor<1x200x12x20xbf16> loc(#loc85) + } -> tensor<1x200x12x20xbf16> loc(#loc85) + xten_nn.output %461 : tensor<1x200x12x20xbf16> loc(#loc85) + } -> tensor<1x200x12x20xbf16> loc(#loc85) + %230 = xten_nn.subgraph (%arg5 = %229: tensor<1x200x12x20xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Add_115", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 200, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Add_115", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 200, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x200x12x20xbf16>) attributes { + LayerName = "Add_115", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 200, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Add_115", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = 
"NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 200, 12, 20]> : vector<4xindex> + } + ], + Specializes = "AddAttributeBroadcastingBf16", + Traits = { + Elementwise = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.num_kernel_iters = 0 : ui16, + config.scalar = 3.000000e+00 : bf16, + config.scalar_position = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<3.000000e+00> : tensor<1x1x1x1xbf16>}> : () -> tensor<1x1x1x1xbf16> loc(#loc) + %463 = tosa.add %arg6, %462 {LayerName = "Add_115", OutputName = "Add_115"} : (tensor<1x200x12x20xbf16>, tensor<1x1x1x1xbf16>) -> tensor<1x200x12x20xbf16> loc(#loc86) + xten_nn.output %463 : tensor<1x200x12x20xbf16> loc(#loc86) + } -> tensor<1x200x12x20xbf16> loc(#loc86) + xten_nn.output %461 : tensor<1x200x12x20xbf16> loc(#loc86) + } -> tensor<1x200x12x20xbf16> loc(#loc86) + %231 = xten_nn.subgraph (%arg5 = %230: tensor<1x200x12x20xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Clip_118", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 200, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Clip_118", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 200, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x200x12x20xbf16>) attributes { + LayerName = "Clip_118", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 200, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Clip_118", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 200, 12, 20]> : vector<4xindex> + } + ], + Specializes = "ClipBf16", + Traits = { + Elementwise = true, + NonNegativeOut = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.clamp_max = 6.000000e+00 : bf16, + config.clamp_min = 0.000000e+00 : bf16, + config.compiler = "chess", + config.ifm_shift = 0 : si8, + config.num_kernel_iters = 0 : ui16, + config.ofm_shift = 0 : si8 + }} { + %462 = tosa.clamp %arg6 { + LayerName = "Clip_118", + OutputName = "Clip_118", + max_fp = 6.000000e+00 : f32, + max_int = 6 : i64, + min_fp = 0.000000e+00 : f32, + min_int = 0 : i64} : (tensor<1x200x12x20xbf16>) -> tensor<1x200x12x20xbf16> loc(#loc87) + xten_nn.output %462 : tensor<1x200x12x20xbf16> loc(#loc87) + } -> tensor<1x200x12x20xbf16> loc(#loc87) + xten_nn.output %461 : tensor<1x200x12x20xbf16> loc(#loc87) + } -> tensor<1x200x12x20xbf16> loc(#loc87) + %232 = xten_nn.subgraph (%arg5 = %231: tensor<1x200x12x20xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Div_120", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + 
l3_tile_count = dense<[1, 200, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Div_120", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 200, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x200x12x20xbf16>) attributes { + LayerName = "Div_120", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 200, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Div_120", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 200, 12, 20]> : vector<4xindex> + } + ], + Specializes = "MulAttributeBroadcastingBf16", + Traits = { + Elementwise = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.num_kernel_iters = 0 : ui16, + config.scalar = 1.660160e-01 : bf16, + config.scalar_position = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<1.660160e-01> : tensor<1x1x1x1xbf16>}> : () -> tensor<1x1x1x1xbf16> loc(#loc) + %463 = tosa.mul %arg6, %462 { + LayerName = "Div_120", + OutputName = "Div_120", + shift = 0 : i8} : (tensor<1x200x12x20xbf16>, tensor<1x1x1x1xbf16>) -> tensor<1x200x12x20xbf16> loc(#loc88) + xten_nn.output %463 : tensor<1x200x12x20xbf16> loc(#loc88) + } -> tensor<1x200x12x20xbf16> loc(#loc88) + xten_nn.output %461 : tensor<1x200x12x20xbf16> loc(#loc88) + } -> tensor<1x200x12x20xbf16> loc(#loc88) + %233 = xten_nn.subgraph (%arg5 = %229: tensor<1x200x12x20xbf16>, %arg6 = %232: tensor<1x200x12x20xbf16>) attributes { + IfmOperands = [0 : index, 1 : index], + LayerName = "Mul_121", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 200, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 200, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Mul_121", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 200, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg7 = %arg5: tensor<1x200x12x20xbf16>, %arg8 = %arg6: tensor<1x200x12x20xbf16>) attributes { + LayerName = "Mul_121", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 200, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = 
"data_io.ifm2", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 200, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Mul_121", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 200, 12, 20]> : vector<4xindex> + } + ], + Specializes = "MulBf16", + Traits = { + Binary = true, + Elementwise = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.dtype = "bfloat16", + config.num_kernel_iters = 0 : ui16 + }} { + %462 = tosa.mul %arg7, %arg8 { + LayerName = "Mul_121", + OutputName = "Mul_121", + shift = 0 : i8} : (tensor<1x200x12x20xbf16>, tensor<1x200x12x20xbf16>) -> tensor<1x200x12x20xbf16> loc(#loc89) + xten_nn.output %462 : tensor<1x200x12x20xbf16> loc(#loc89) + } -> tensor<1x200x12x20xbf16> loc(#loc89) + xten_nn.output %461 : tensor<1x200x12x20xbf16> loc(#loc89) + } -> tensor<1x200x12x20xbf16> loc(#loc89) + %234 = xten_nn.subgraph (%arg5 = %233: tensor<1x200x12x20xbf16>, %arg6 = %103: tensor<200x1x3x3xbf16>, %arg7 = %102: tensor<200xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Conv_122", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 200, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "CMHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[200, 1, 3, 3]> : vector<4xindex> + }, + { + UnknownDataFormat = true + } + ], + OutputName = "Conv_122", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 200, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg8 = %arg5: tensor<1x200x12x20xbf16>, %arg9 = %arg6: tensor<200x1x3x3xbf16>, %arg10 = %arg7: tensor<200xbf16>) attributes { + Dilations = array, + HWPadding = [[1, 1], [1, 1]], + LayerName = "Conv_122", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 200, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "CMHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.wts", + SubPort = "wts_data", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[200, 1, 3, 3]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Conv_122", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 200, 12, 20]> : vector<4xindex> + } + ], + Specializes = "DepthwiseConv2dBf16", + With = { + config.act = 0 : ui8, + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.kernel_height = 3 : ui8, + config.kernel_width = 3 : ui8, + config.stride = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = 
dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %463 = "tosa.const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %464 = "tosa.const"() <{value = dense<[2, 3, 0, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc90) + %465 = tosa.transpose %arg9, %464 : (tensor<200x1x3x3xbf16>, tensor<4xi32>) -> tensor<3x3x200x1xbf16> loc(#loc90) + %466 = tosa.transpose %arg8, %463 : (tensor<1x200x12x20xbf16>, tensor<4xi32>) -> tensor<1x12x20x200xbf16> loc(#loc90) + %467 = tosa.depthwise_conv2d %466, %465, %arg10 { + PartOfLayerName = "Conv_122", + PartOfOutputName = "Conv_122", + dilation = array, + pad = array, + stride = array} : (tensor<1x12x20x200xbf16>, tensor<3x3x200x1xbf16>, tensor<200xbf16>) -> tensor<1x12x20x200xbf16> loc(#loc90) + %468 = tosa.transpose %467, %462 : (tensor<1x12x20x200xbf16>, tensor<4xi32>) -> tensor<1x200x12x20xbf16> loc(#loc90) + xten_nn.output %468 : tensor<1x200x12x20xbf16> loc(#loc90) + } -> tensor<1x200x12x20xbf16> loc(#loc90) + xten_nn.output %461 : tensor<1x200x12x20xbf16> loc(#loc90) + } -> tensor<1x200x12x20xbf16> loc(#loc90) + %235 = xten_nn.subgraph (%arg5 = %234: tensor<1x200x12x20xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Add_124", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 200, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Add_124", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 200, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x200x12x20xbf16>) attributes { + LayerName = "Add_124", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 200, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Add_124", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 200, 12, 20]> : vector<4xindex> + } + ], + Specializes = "AddAttributeBroadcastingBf16", + Traits = { + Elementwise = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.num_kernel_iters = 0 : ui16, + config.scalar = 3.000000e+00 : bf16, + config.scalar_position = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<3.000000e+00> : tensor<1x1x1x1xbf16>}> : () -> tensor<1x1x1x1xbf16> loc(#loc) + %463 = tosa.add %arg6, %462 {LayerName = "Add_124", OutputName = "Add_124"} : (tensor<1x200x12x20xbf16>, tensor<1x1x1x1xbf16>) -> tensor<1x200x12x20xbf16> loc(#loc91) + xten_nn.output %463 : tensor<1x200x12x20xbf16> loc(#loc91) + } -> tensor<1x200x12x20xbf16> loc(#loc91) + xten_nn.output %461 : tensor<1x200x12x20xbf16> loc(#loc91) + } -> tensor<1x200x12x20xbf16> loc(#loc91) + %236 = xten_nn.subgraph (%arg5 = %235: tensor<1x200x12x20xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Clip_127", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat 
= "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 200, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Clip_127", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 200, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x200x12x20xbf16>) attributes { + LayerName = "Clip_127", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 200, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Clip_127", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 200, 12, 20]> : vector<4xindex> + } + ], + Specializes = "ClipBf16", + Traits = { + Elementwise = true, + NonNegativeOut = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.clamp_max = 6.000000e+00 : bf16, + config.clamp_min = 0.000000e+00 : bf16, + config.compiler = "chess", + config.ifm_shift = 0 : si8, + config.num_kernel_iters = 0 : ui16, + config.ofm_shift = 0 : si8 + }} { + %462 = tosa.clamp %arg6 { + LayerName = "Clip_127", + OutputName = "Clip_127", + max_fp = 6.000000e+00 : f32, + max_int = 6 : i64, + min_fp = 0.000000e+00 : f32, + min_int = 0 : i64} : (tensor<1x200x12x20xbf16>) -> tensor<1x200x12x20xbf16> loc(#loc92) + xten_nn.output %462 : tensor<1x200x12x20xbf16> loc(#loc92) + } -> tensor<1x200x12x20xbf16> loc(#loc92) + xten_nn.output %461 : tensor<1x200x12x20xbf16> loc(#loc92) + } -> tensor<1x200x12x20xbf16> loc(#loc92) + %237 = xten_nn.subgraph (%arg5 = %236: tensor<1x200x12x20xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Div_129", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 200, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Div_129", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 200, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x200x12x20xbf16>) attributes { + LayerName = "Div_129", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 200, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Div_129", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 200, 12, 20]> : vector<4xindex> + } + ], + Specializes = 
"MulAttributeBroadcastingBf16", + Traits = { + Elementwise = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.num_kernel_iters = 0 : ui16, + config.scalar = 1.660160e-01 : bf16, + config.scalar_position = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<1.660160e-01> : tensor<1x1x1x1xbf16>}> : () -> tensor<1x1x1x1xbf16> loc(#loc) + %463 = tosa.mul %arg6, %462 { + LayerName = "Div_129", + OutputName = "Div_129", + shift = 0 : i8} : (tensor<1x200x12x20xbf16>, tensor<1x1x1x1xbf16>) -> tensor<1x200x12x20xbf16> loc(#loc93) + xten_nn.output %463 : tensor<1x200x12x20xbf16> loc(#loc93) + } -> tensor<1x200x12x20xbf16> loc(#loc93) + xten_nn.output %461 : tensor<1x200x12x20xbf16> loc(#loc93) + } -> tensor<1x200x12x20xbf16> loc(#loc93) + %238 = xten_nn.subgraph (%arg5 = %234: tensor<1x200x12x20xbf16>, %arg6 = %237: tensor<1x200x12x20xbf16>) attributes { + IfmOperands = [0 : index, 1 : index], + LayerName = "Mul_130", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 200, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 200, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Mul_130", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 200, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg7 = %arg5: tensor<1x200x12x20xbf16>, %arg8 = %arg6: tensor<1x200x12x20xbf16>) attributes { + LayerName = "Mul_130", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 200, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 200, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Mul_130", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 200, 12, 20]> : vector<4xindex> + } + ], + Specializes = "MulBf16", + Traits = { + Binary = true, + Elementwise = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.dtype = "bfloat16", + config.num_kernel_iters = 0 : ui16 + }} { + %462 = tosa.mul %arg7, %arg8 { + LayerName = "Mul_130", + OutputName = "Mul_130", + shift = 0 : i8} : (tensor<1x200x12x20xbf16>, tensor<1x200x12x20xbf16>) -> tensor<1x200x12x20xbf16> loc(#loc94) + xten_nn.output %462 : tensor<1x200x12x20xbf16> loc(#loc94) + } -> tensor<1x200x12x20xbf16> loc(#loc94) + xten_nn.output %461 : tensor<1x200x12x20xbf16> loc(#loc94) + } -> tensor<1x200x12x20xbf16> loc(#loc94) + %239 = xten_nn.subgraph (%arg5 = %238: tensor<1x200x12x20xbf16>, %arg6 = %101: tensor<80x200x1x1xbf16>, %arg7 = %100: 
tensor<80xbf16>, %arg8 = %228: tensor<1x80x12x20xbf16>) attributes { + IfmOperands = [0 : index, 3 : index], + LayerName = "Conv_131", + OfmShare = 3 : index, + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 200, 12, 20]> : vector<4xindex> + }, + { + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[80, 200, 1, 1]> : vector<4xindex> + }, + { + UnknownDataFormat = true + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 80, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Add_132", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 80, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg9 = %arg5: tensor<1x200x12x20xbf16>, %arg10 = %arg6: tensor<80x200x1x1xbf16>, %arg11 = %arg7: tensor<80xbf16>) attributes { + Dilations = array<i64: 1, 1>, + HWPadding = [[0, 0], [0, 0]], + LayerName = "Conv_131", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 200, 12, 20]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "wts_data", + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[80, 200, 1, 1]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Conv_131", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 80, 12, 20]> : vector<4xindex> + } + ], + Specializes = "Conv2DBf16", + Traits = { + AllowDMAOptimization = true + }, + With = { + config.AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 = 1 : ui8, + config.act = 0 : ui8, + config.act_type = "RELU", + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.conv_type = [0 : ui8, 12 : ui8, 64 : ui8], + config.dtype_ifm = "bfloat16", + config.dtype_ofm = "bfloat16", + config.dtype_wts = "bfloat16", + config.ksize.height = 1 : ui8, + config.ksize.width = 1 : ui8, + config.lrelu_alpha = 1.000000e+00 : bf16, + config.lrelu_alpha_kernel = 1.000000e+00 : bf16, + config.stride_h = 1 : ui8, + config.stride_w = 1 : ui8 + }} { + %463 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %464 = "tosa.const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc95) + %465 = tosa.reshape %arg10 {new_shape = array<i64: 80, 1, 1, 200>} : (tensor<80x200x1x1xbf16>) -> tensor<80x1x1x200xbf16> loc(#loc95) + %466 = tosa.transpose %arg9, %464 : (tensor<1x200x12x20xbf16>, tensor<4xi32>) -> tensor<1x12x20x200xbf16> loc(#loc95) + %467 = tosa.conv2d %466, %465, %arg11 { + PartOfLayerName = "Conv_131", + PartOfOutputName = "Conv_131", + dilation = array<i64: 1, 1>, + pad = array<i64: 0, 0, 0, 0>, + stride = array<i64: 1, 1>} : (tensor<1x12x20x200xbf16>, 
tensor<80x1x1x200xbf16>, tensor<80xbf16>) -> tensor<1x12x20x80xbf16> loc(#loc95) + %468 = tosa.transpose %467, %463 : (tensor<1x12x20x80xbf16>, tensor<4xi32>) -> tensor<1x80x12x20xbf16> loc(#loc95) + xten_nn.output %468 : tensor<1x80x12x20xbf16> loc(#loc95) + } -> tensor<1x80x12x20xbf16> loc(#loc95) + %462 = xten_nn.subgraph (%arg9 = %461: tensor<1x80x12x20xbf16>, %arg10 = %arg8: tensor<1x80x12x20xbf16>) attributes { + LayerName = "Add_132", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 80, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 80, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Add_132", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 80, 12, 20]> : vector<4xindex> + } + ], + Specializes = "AddBf16", + Traits = { + Binary = true, + Elementwise = true + }, + With = { + config.act = 0 : ui8, + config.act_type = "LINEAR", + config.aie_arch = "aie2p", + config.compiler = "chess", + config.dtype = "bfloat16", + config.num_kernel_iters = 0 : ui16 + }} { + %463 = tosa.add %arg9, %arg10 {LayerName = "Add_132", OutputName = "Add_132"} : (tensor<1x80x12x20xbf16>, tensor<1x80x12x20xbf16>) -> tensor<1x80x12x20xbf16> loc(#loc96) + xten_nn.output %463 : tensor<1x80x12x20xbf16> loc(#loc96) + } -> tensor<1x80x12x20xbf16> loc(#loc96) + xten_nn.output %462 : tensor<1x80x12x20xbf16> loc(#loc96) + } -> tensor<1x80x12x20xbf16> loc(#loc330) + %240 = xten_nn.subgraph (%arg5 = %239: tensor<1x80x12x20xbf16>, %arg6 = %99: tensor<184x80x1x1xbf16>, %arg7 = %98: tensor<184xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Conv_133", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 80, 12, 20]> : vector<4xindex> + }, + { + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[184, 80, 1, 1]> : vector<4xindex> + }, + { + UnknownDataFormat = true + } + ], + OutputName = "Conv_133", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg8 = %arg5: tensor<1x80x12x20xbf16>, %arg9 = %arg6: tensor<184x80x1x1xbf16>, %arg10 = %arg7: tensor<184xbf16>) attributes { + Dilations = array<i64: 1, 1>, + HWPadding = [[0, 0], [0, 0]], + LayerName = "Conv_133", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 80, 12, 20]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "wts_data", + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + 
l3_tile_count = dense<[184, 80, 1, 1]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Conv_133", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + Specializes = "Conv2DBf16", + Traits = { + AllowDMAOptimization = true + }, + With = { + config.AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 = 1 : ui8, + config.act = 0 : ui8, + config.act_type = "RELU", + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.conv_type = [0 : ui8, 12 : ui8, 64 : ui8], + config.dtype_ifm = "bfloat16", + config.dtype_ofm = "bfloat16", + config.dtype_wts = "bfloat16", + config.ksize.height = 1 : ui8, + config.ksize.width = 1 : ui8, + config.lrelu_alpha = 1.000000e+00 : bf16, + config.lrelu_alpha_kernel = 1.000000e+00 : bf16, + config.stride_h = 1 : ui8, + config.stride_w = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %463 = "tosa.const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc97) + %464 = tosa.reshape %arg9 {new_shape = array<i64: 184, 1, 1, 80>} : (tensor<184x80x1x1xbf16>) -> tensor<184x1x1x80xbf16> loc(#loc97) + %465 = tosa.transpose %arg8, %463 : (tensor<1x80x12x20xbf16>, tensor<4xi32>) -> tensor<1x12x20x80xbf16> loc(#loc97) + %466 = tosa.conv2d %465, %464, %arg10 { + PartOfLayerName = "Conv_133", + PartOfOutputName = "Conv_133", + dilation = array<i64: 1, 1>, + pad = array<i64: 0, 0, 0, 0>, + stride = array<i64: 1, 1>} : (tensor<1x12x20x80xbf16>, tensor<184x1x1x80xbf16>, tensor<184xbf16>) -> tensor<1x12x20x184xbf16> loc(#loc97) + %467 = tosa.transpose %466, %462 : (tensor<1x12x20x184xbf16>, tensor<4xi32>) -> tensor<1x184x12x20xbf16> loc(#loc97) + xten_nn.output %467 : tensor<1x184x12x20xbf16> loc(#loc97) + } -> tensor<1x184x12x20xbf16> loc(#loc97) + xten_nn.output %461 : tensor<1x184x12x20xbf16> loc(#loc97) + } -> tensor<1x184x12x20xbf16> loc(#loc97) + %241 = xten_nn.subgraph (%arg5 = %240: tensor<1x184x12x20xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Add_135", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Add_135", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x184x12x20xbf16>) attributes { + LayerName = "Add_135", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Add_135", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 
12, 20]> : vector<4xindex> + } + ], + Specializes = "AddAttributeBroadcastingBf16", + Traits = { + Elementwise = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.num_kernel_iters = 0 : ui16, + config.scalar = 3.000000e+00 : bf16, + config.scalar_position = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<3.000000e+00> : tensor<1x1x1x1xbf16>}> : () -> tensor<1x1x1x1xbf16> loc(#loc) + %463 = tosa.add %arg6, %462 {LayerName = "Add_135", OutputName = "Add_135"} : (tensor<1x184x12x20xbf16>, tensor<1x1x1x1xbf16>) -> tensor<1x184x12x20xbf16> loc(#loc98) + xten_nn.output %463 : tensor<1x184x12x20xbf16> loc(#loc98) + } -> tensor<1x184x12x20xbf16> loc(#loc98) + xten_nn.output %461 : tensor<1x184x12x20xbf16> loc(#loc98) + } -> tensor<1x184x12x20xbf16> loc(#loc98) + %242 = xten_nn.subgraph (%arg5 = %241: tensor<1x184x12x20xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Clip_138", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Clip_138", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x184x12x20xbf16>) attributes { + LayerName = "Clip_138", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Clip_138", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + Specializes = "ClipBf16", + Traits = { + Elementwise = true, + NonNegativeOut = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.clamp_max = 6.000000e+00 : bf16, + config.clamp_min = 0.000000e+00 : bf16, + config.compiler = "chess", + config.ifm_shift = 0 : si8, + config.num_kernel_iters = 0 : ui16, + config.ofm_shift = 0 : si8 + }} { + %462 = tosa.clamp %arg6 { + LayerName = "Clip_138", + OutputName = "Clip_138", + max_fp = 6.000000e+00 : f32, + max_int = 6 : i64, + min_fp = 0.000000e+00 : f32, + min_int = 0 : i64} : (tensor<1x184x12x20xbf16>) -> tensor<1x184x12x20xbf16> loc(#loc99) + xten_nn.output %462 : tensor<1x184x12x20xbf16> loc(#loc99) + } -> tensor<1x184x12x20xbf16> loc(#loc99) + xten_nn.output %461 : tensor<1x184x12x20xbf16> loc(#loc99) + } -> tensor<1x184x12x20xbf16> loc(#loc99) + %243 = xten_nn.subgraph (%arg5 = %242: tensor<1x184x12x20xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Div_140", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Div_140", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = 
"InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x184x12x20xbf16>) attributes { + LayerName = "Div_140", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Div_140", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + Specializes = "MulAttributeBroadcastingBf16", + Traits = { + Elementwise = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.num_kernel_iters = 0 : ui16, + config.scalar = 1.660160e-01 : bf16, + config.scalar_position = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<1.660160e-01> : tensor<1x1x1x1xbf16>}> : () -> tensor<1x1x1x1xbf16> loc(#loc) + %463 = tosa.mul %arg6, %462 { + LayerName = "Div_140", + OutputName = "Div_140", + shift = 0 : i8} : (tensor<1x184x12x20xbf16>, tensor<1x1x1x1xbf16>) -> tensor<1x184x12x20xbf16> loc(#loc100) + xten_nn.output %463 : tensor<1x184x12x20xbf16> loc(#loc100) + } -> tensor<1x184x12x20xbf16> loc(#loc100) + xten_nn.output %461 : tensor<1x184x12x20xbf16> loc(#loc100) + } -> tensor<1x184x12x20xbf16> loc(#loc100) + %244 = xten_nn.subgraph (%arg5 = %240: tensor<1x184x12x20xbf16>, %arg6 = %243: tensor<1x184x12x20xbf16>) attributes { + IfmOperands = [0 : index, 1 : index], + LayerName = "Mul_141", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Mul_141", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg7 = %arg5: tensor<1x184x12x20xbf16>, %arg8 = %arg6: tensor<1x184x12x20xbf16>) attributes { + LayerName = "Mul_141", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Mul_141", + Reason = 
"MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + Specializes = "MulBf16", + Traits = { + Binary = true, + Elementwise = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.dtype = "bfloat16", + config.num_kernel_iters = 0 : ui16 + }} { + %462 = tosa.mul %arg7, %arg8 { + LayerName = "Mul_141", + OutputName = "Mul_141", + shift = 0 : i8} : (tensor<1x184x12x20xbf16>, tensor<1x184x12x20xbf16>) -> tensor<1x184x12x20xbf16> loc(#loc101) + xten_nn.output %462 : tensor<1x184x12x20xbf16> loc(#loc101) + } -> tensor<1x184x12x20xbf16> loc(#loc101) + xten_nn.output %461 : tensor<1x184x12x20xbf16> loc(#loc101) + } -> tensor<1x184x12x20xbf16> loc(#loc101) + %245 = xten_nn.subgraph (%arg5 = %244: tensor<1x184x12x20xbf16>, %arg6 = %97: tensor<184x1x3x3xbf16>, %arg7 = %96: tensor<184xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Conv_142", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "CMHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[184, 1, 3, 3]> : vector<4xindex> + }, + { + UnknownDataFormat = true + } + ], + OutputName = "Conv_142", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg8 = %arg5: tensor<1x184x12x20xbf16>, %arg9 = %arg6: tensor<184x1x3x3xbf16>, %arg10 = %arg7: tensor<184xbf16>) attributes { + Dilations = array, + HWPadding = [[1, 1], [1, 1]], + LayerName = "Conv_142", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "CMHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.wts", + SubPort = "wts_data", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[184, 1, 3, 3]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Conv_142", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + Specializes = "DepthwiseConv2dBf16", + With = { + config.act = 0 : ui8, + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.kernel_height = 3 : ui8, + config.kernel_width = 3 : ui8, + config.stride = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %463 = "tosa.const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) 
+ %464 = "tosa.const"() <{value = dense<[2, 3, 0, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc102) + %465 = tosa.transpose %arg9, %464 : (tensor<184x1x3x3xbf16>, tensor<4xi32>) -> tensor<3x3x184x1xbf16> loc(#loc102) + %466 = tosa.transpose %arg8, %463 : (tensor<1x184x12x20xbf16>, tensor<4xi32>) -> tensor<1x12x20x184xbf16> loc(#loc102) + %467 = tosa.depthwise_conv2d %466, %465, %arg10 { + PartOfLayerName = "Conv_142", + PartOfOutputName = "Conv_142", + dilation = array<i64: 1, 1>, + pad = array<i64: 1, 1, 1, 1>, + stride = array<i64: 1, 1>} : (tensor<1x12x20x184xbf16>, tensor<3x3x184x1xbf16>, tensor<184xbf16>) -> tensor<1x12x20x184xbf16> loc(#loc102) + %468 = tosa.transpose %467, %462 : (tensor<1x12x20x184xbf16>, tensor<4xi32>) -> tensor<1x184x12x20xbf16> loc(#loc102) + xten_nn.output %468 : tensor<1x184x12x20xbf16> loc(#loc102) + } -> tensor<1x184x12x20xbf16> loc(#loc102) + xten_nn.output %461 : tensor<1x184x12x20xbf16> loc(#loc102) + } -> tensor<1x184x12x20xbf16> loc(#loc102) + %246 = xten_nn.subgraph (%arg5 = %245: tensor<1x184x12x20xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Add_144", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Add_144", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x184x12x20xbf16>) attributes { + LayerName = "Add_144", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Add_144", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + Specializes = "AddAttributeBroadcastingBf16", + Traits = { + Elementwise = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.num_kernel_iters = 0 : ui16, + config.scalar = 3.000000e+00 : bf16, + config.scalar_position = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<3.000000e+00> : tensor<1x1x1x1xbf16>}> : () -> tensor<1x1x1x1xbf16> loc(#loc) + %463 = tosa.add %arg6, %462 {LayerName = "Add_144", OutputName = "Add_144"} : (tensor<1x184x12x20xbf16>, tensor<1x1x1x1xbf16>) -> tensor<1x184x12x20xbf16> loc(#loc103) + xten_nn.output %463 : tensor<1x184x12x20xbf16> loc(#loc103) + } -> tensor<1x184x12x20xbf16> loc(#loc103) + xten_nn.output %461 : tensor<1x184x12x20xbf16> loc(#loc103) + } -> tensor<1x184x12x20xbf16> loc(#loc103) + %247 = xten_nn.subgraph (%arg5 = %246: tensor<1x184x12x20xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Clip_147", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + 
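+ // [editor's annotation, not part of the generated IR] TOSA convolutions are NHWC-only, so every NCHW conv in
+ // this file is bracketed by a transpose pair: [0, 2, 3, 1] on the input and [0, 3, 1, 2] on the result. For the
+ // depthwise Conv_142 above, the 184x1x3x3 (CMHW) weights are additionally permuted by [2, 3, 0, 1] into the
+ // 3x3x184x1 (HWCM) layout that tosa.depthwise_conv2d expects.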
OutputName = "Clip_147", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x184x12x20xbf16>) attributes { + LayerName = "Clip_147", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Clip_147", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + Specializes = "ClipBf16", + Traits = { + Elementwise = true, + NonNegativeOut = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.clamp_max = 6.000000e+00 : bf16, + config.clamp_min = 0.000000e+00 : bf16, + config.compiler = "chess", + config.ifm_shift = 0 : si8, + config.num_kernel_iters = 0 : ui16, + config.ofm_shift = 0 : si8 + }} { + %462 = tosa.clamp %arg6 { + LayerName = "Clip_147", + OutputName = "Clip_147", + max_fp = 6.000000e+00 : f32, + max_int = 6 : i64, + min_fp = 0.000000e+00 : f32, + min_int = 0 : i64} : (tensor<1x184x12x20xbf16>) -> tensor<1x184x12x20xbf16> loc(#loc104) + xten_nn.output %462 : tensor<1x184x12x20xbf16> loc(#loc104) + } -> tensor<1x184x12x20xbf16> loc(#loc104) + xten_nn.output %461 : tensor<1x184x12x20xbf16> loc(#loc104) + } -> tensor<1x184x12x20xbf16> loc(#loc104) + %248 = xten_nn.subgraph (%arg5 = %247: tensor<1x184x12x20xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Div_149", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Div_149", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x184x12x20xbf16>) attributes { + LayerName = "Div_149", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Div_149", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + Specializes = "MulAttributeBroadcastingBf16", + Traits = { + Elementwise = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + 
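+ // [editor's annotation, not part of the generated IR] Add_144 -> Clip_147 -> Div_149 -> Mul_150 is the usual
+ // hard-swish decomposition hswish(x) = x * clamp(x + 3, 0, 6) * (1/6): the divide-by-6 is lowered to a scalar
+ // multiply (MulAttributeBroadcastingBf16) whose constant 1.660160e-01 is 1/6 rounded to bf16. Worked example
+ // for x = 1: clamp(4, 0, 6) * 0.166016 = 0.664, against the exact 4/6 = 0.667. The same four-op chain appears
+ // as Add_135/Clip_138/Div_140/Mul_141 above and the Add_155 and Add_164 chains below.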
config.num_kernel_iters = 0 : ui16, + config.scalar = 1.660160e-01 : bf16, + config.scalar_position = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<1.660160e-01> : tensor<1x1x1x1xbf16>}> : () -> tensor<1x1x1x1xbf16> loc(#loc) + %463 = tosa.mul %arg6, %462 { + LayerName = "Div_149", + OutputName = "Div_149", + shift = 0 : i8} : (tensor<1x184x12x20xbf16>, tensor<1x1x1x1xbf16>) -> tensor<1x184x12x20xbf16> loc(#loc105) + xten_nn.output %463 : tensor<1x184x12x20xbf16> loc(#loc105) + } -> tensor<1x184x12x20xbf16> loc(#loc105) + xten_nn.output %461 : tensor<1x184x12x20xbf16> loc(#loc105) + } -> tensor<1x184x12x20xbf16> loc(#loc105) + %249 = xten_nn.subgraph (%arg5 = %245: tensor<1x184x12x20xbf16>, %arg6 = %248: tensor<1x184x12x20xbf16>) attributes { + IfmOperands = [0 : index, 1 : index], + LayerName = "Mul_150", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Mul_150", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg7 = %arg5: tensor<1x184x12x20xbf16>, %arg8 = %arg6: tensor<1x184x12x20xbf16>) attributes { + LayerName = "Mul_150", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Mul_150", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + Specializes = "MulBf16", + Traits = { + Binary = true, + Elementwise = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.dtype = "bfloat16", + config.num_kernel_iters = 0 : ui16 + }} { + %462 = tosa.mul %arg7, %arg8 { + LayerName = "Mul_150", + OutputName = "Mul_150", + shift = 0 : i8} : (tensor<1x184x12x20xbf16>, tensor<1x184x12x20xbf16>) -> tensor<1x184x12x20xbf16> loc(#loc106) + xten_nn.output %462 : tensor<1x184x12x20xbf16> loc(#loc106) + } -> tensor<1x184x12x20xbf16> loc(#loc106) + xten_nn.output %461 : tensor<1x184x12x20xbf16> loc(#loc106) + } -> tensor<1x184x12x20xbf16> loc(#loc106) + %250 = xten_nn.subgraph (%arg5 = %249: tensor<1x184x12x20xbf16>, %arg6 = %95: tensor<80x184x1x1xbf16>, %arg7 = %94: tensor<80xbf16>, %arg8 = %239: tensor<1x80x12x20xbf16>) attributes { + IfmOperands = [0 : index, 3 : index], + LayerName = "Conv_151", + OfmShare = 3 : index, + 
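+ // [editor's annotation, an inference from attribute names, not stated in the IR] OfmShare = 3 presumably marks
+ // that the chain's output buffer may alias operand 3 (%239, the previous block's Add_132 result), letting the
+ // fused Conv_151 + Add_152 pair write the residual sum in place.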
Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + }, + { + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[80, 184, 1, 1]> : vector<4xindex> + }, + { + UnknownDataFormat = true + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 80, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Add_152", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 80, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg9 = %arg5: tensor<1x184x12x20xbf16>, %arg10 = %arg6: tensor<80x184x1x1xbf16>, %arg11 = %arg7: tensor<80xbf16>) attributes { + Dilations = array<i64: 1, 1>, + HWPadding = [[0, 0], [0, 0]], + LayerName = "Conv_151", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "wts_data", + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[80, 184, 1, 1]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Conv_151", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 80, 12, 20]> : vector<4xindex> + } + ], + Specializes = "Conv2DBf16", + Traits = { + AllowDMAOptimization = true + }, + With = { + config.AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 = 1 : ui8, + config.act = 0 : ui8, + config.act_type = "RELU", + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.conv_type = [0 : ui8, 12 : ui8, 64 : ui8], + config.dtype_ifm = "bfloat16", + config.dtype_ofm = "bfloat16", + config.dtype_wts = "bfloat16", + config.ksize.height = 1 : ui8, + config.ksize.width = 1 : ui8, + config.lrelu_alpha = 1.000000e+00 : bf16, + config.lrelu_alpha_kernel = 1.000000e+00 : bf16, + config.stride_h = 1 : ui8, + config.stride_w = 1 : ui8 + }} { + %463 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %464 = "tosa.const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc107) + %465 = tosa.reshape %arg10 {new_shape = array<i64: 80, 1, 1, 184>} : (tensor<80x184x1x1xbf16>) -> tensor<80x1x1x184xbf16> loc(#loc107) + %466 = tosa.transpose %arg9, %464 : (tensor<1x184x12x20xbf16>, tensor<4xi32>) -> tensor<1x12x20x184xbf16> loc(#loc107) + %467 = tosa.conv2d %466, %465, %arg11 { + PartOfLayerName = "Conv_151", + PartOfOutputName = "Conv_151", + dilation = array<i64: 1, 1>, + pad = array<i64: 0, 0, 0, 0>, + stride = array<i64: 1, 1>} : (tensor<1x12x20x184xbf16>, tensor<80x1x1x184xbf16>, tensor<80xbf16>) -> tensor<1x12x20x80xbf16> loc(#loc107) + %468 = tosa.transpose %467, %463 : (tensor<1x12x20x80xbf16>, tensor<4xi32>) -> 
tensor<1x80x12x20xbf16> loc(#loc107) + xten_nn.output %468 : tensor<1x80x12x20xbf16> loc(#loc107) + } -> tensor<1x80x12x20xbf16> loc(#loc107) + %462 = xten_nn.subgraph (%arg9 = %461: tensor<1x80x12x20xbf16>, %arg10 = %arg8: tensor<1x80x12x20xbf16>) attributes { + LayerName = "Add_152", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 80, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 80, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Add_152", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 80, 12, 20]> : vector<4xindex> + } + ], + Specializes = "AddBf16", + Traits = { + Binary = true, + Elementwise = true + }, + With = { + config.act = 0 : ui8, + config.act_type = "LINEAR", + config.aie_arch = "aie2p", + config.compiler = "chess", + config.dtype = "bfloat16", + config.num_kernel_iters = 0 : ui16 + }} { + %463 = tosa.add %arg9, %arg10 {LayerName = "Add_152", OutputName = "Add_152"} : (tensor<1x80x12x20xbf16>, tensor<1x80x12x20xbf16>) -> tensor<1x80x12x20xbf16> loc(#loc108) + xten_nn.output %463 : tensor<1x80x12x20xbf16> loc(#loc108) + } -> tensor<1x80x12x20xbf16> loc(#loc108) + xten_nn.output %462 : tensor<1x80x12x20xbf16> loc(#loc108) + } -> tensor<1x80x12x20xbf16> loc(#loc331) + %251 = xten_nn.subgraph (%arg5 = %250: tensor<1x80x12x20xbf16>, %arg6 = %93: tensor<184x80x1x1xbf16>, %arg7 = %92: tensor<184xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Conv_153", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 80, 12, 20]> : vector<4xindex> + }, + { + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[184, 80, 1, 1]> : vector<4xindex> + }, + { + UnknownDataFormat = true + } + ], + OutputName = "Conv_153", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg8 = %arg5: tensor<1x80x12x20xbf16>, %arg9 = %arg6: tensor<184x80x1x1xbf16>, %arg10 = %arg7: tensor<184xbf16>) attributes { + Dilations = array<i64: 1, 1>, + HWPadding = [[0, 0], [0, 0]], + LayerName = "Conv_153", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 80, 12, 20]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "wts_data", + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[184, 80, 1, 1]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = 
"Conv_153", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + Specializes = "Conv2DBf16", + Traits = { + AllowDMAOptimization = true + }, + With = { + config.AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 = 1 : ui8, + config.act = 0 : ui8, + config.act_type = "RELU", + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.conv_type = [0 : ui8, 12 : ui8, 64 : ui8], + config.dtype_ifm = "bfloat16", + config.dtype_ofm = "bfloat16", + config.dtype_wts = "bfloat16", + config.ksize.height = 1 : ui8, + config.ksize.width = 1 : ui8, + config.lrelu_alpha = 1.000000e+00 : bf16, + config.lrelu_alpha_kernel = 1.000000e+00 : bf16, + config.stride_h = 1 : ui8, + config.stride_w = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %463 = "tosa.const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc109) + %464 = tosa.reshape %arg9 {new_shape = array<i64: 184, 1, 1, 80>} : (tensor<184x80x1x1xbf16>) -> tensor<184x1x1x80xbf16> loc(#loc109) + %465 = tosa.transpose %arg8, %463 : (tensor<1x80x12x20xbf16>, tensor<4xi32>) -> tensor<1x12x20x80xbf16> loc(#loc109) + %466 = tosa.conv2d %465, %464, %arg10 { + PartOfLayerName = "Conv_153", + PartOfOutputName = "Conv_153", + dilation = array<i64: 1, 1>, + pad = array<i64: 0, 0, 0, 0>, + stride = array<i64: 1, 1>} : (tensor<1x12x20x80xbf16>, tensor<184x1x1x80xbf16>, tensor<184xbf16>) -> tensor<1x12x20x184xbf16> loc(#loc109) + %467 = tosa.transpose %466, %462 : (tensor<1x12x20x184xbf16>, tensor<4xi32>) -> tensor<1x184x12x20xbf16> loc(#loc109) + xten_nn.output %467 : tensor<1x184x12x20xbf16> loc(#loc109) + } -> tensor<1x184x12x20xbf16> loc(#loc109) + xten_nn.output %461 : tensor<1x184x12x20xbf16> loc(#loc109) + } -> tensor<1x184x12x20xbf16> loc(#loc109) + %252 = xten_nn.subgraph (%arg5 = %251: tensor<1x184x12x20xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Add_155", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Add_155", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x184x12x20xbf16>) attributes { + LayerName = "Add_155", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Add_155", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + Specializes = "AddAttributeBroadcastingBf16", + Traits = { + Elementwise = true, + Unary = true + }, + With = { + 
config.aie_arch = "aie2p", + config.compiler = "chess", + config.num_kernel_iters = 0 : ui16, + config.scalar = 3.000000e+00 : bf16, + config.scalar_position = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<3.000000e+00> : tensor<1x1x1x1xbf16>}> : () -> tensor<1x1x1x1xbf16> loc(#loc) + %463 = tosa.add %arg6, %462 {LayerName = "Add_155", OutputName = "Add_155"} : (tensor<1x184x12x20xbf16>, tensor<1x1x1x1xbf16>) -> tensor<1x184x12x20xbf16> loc(#loc110) + xten_nn.output %463 : tensor<1x184x12x20xbf16> loc(#loc110) + } -> tensor<1x184x12x20xbf16> loc(#loc110) + xten_nn.output %461 : tensor<1x184x12x20xbf16> loc(#loc110) + } -> tensor<1x184x12x20xbf16> loc(#loc110) + %253 = xten_nn.subgraph (%arg5 = %252: tensor<1x184x12x20xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Clip_158", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Clip_158", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x184x12x20xbf16>) attributes { + LayerName = "Clip_158", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Clip_158", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + Specializes = "ClipBf16", + Traits = { + Elementwise = true, + NonNegativeOut = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.clamp_max = 6.000000e+00 : bf16, + config.clamp_min = 0.000000e+00 : bf16, + config.compiler = "chess", + config.ifm_shift = 0 : si8, + config.num_kernel_iters = 0 : ui16, + config.ofm_shift = 0 : si8 + }} { + %462 = tosa.clamp %arg6 { + LayerName = "Clip_158", + OutputName = "Clip_158", + max_fp = 6.000000e+00 : f32, + max_int = 6 : i64, + min_fp = 0.000000e+00 : f32, + min_int = 0 : i64} : (tensor<1x184x12x20xbf16>) -> tensor<1x184x12x20xbf16> loc(#loc111) + xten_nn.output %462 : tensor<1x184x12x20xbf16> loc(#loc111) + } -> tensor<1x184x12x20xbf16> loc(#loc111) + xten_nn.output %461 : tensor<1x184x12x20xbf16> loc(#loc111) + } -> tensor<1x184x12x20xbf16> loc(#loc111) + %254 = xten_nn.subgraph (%arg5 = %253: tensor<1x184x12x20xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Div_160", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Div_160", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : 
vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x184x12x20xbf16>) attributes { + LayerName = "Div_160", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Div_160", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + Specializes = "MulAttributeBroadcastingBf16", + Traits = { + Elementwise = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.num_kernel_iters = 0 : ui16, + config.scalar = 1.660160e-01 : bf16, + config.scalar_position = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<1.660160e-01> : tensor<1x1x1x1xbf16>}> : () -> tensor<1x1x1x1xbf16> loc(#loc) + %463 = tosa.mul %arg6, %462 { + LayerName = "Div_160", + OutputName = "Div_160", + shift = 0 : i8} : (tensor<1x184x12x20xbf16>, tensor<1x1x1x1xbf16>) -> tensor<1x184x12x20xbf16> loc(#loc112) + xten_nn.output %463 : tensor<1x184x12x20xbf16> loc(#loc112) + } -> tensor<1x184x12x20xbf16> loc(#loc112) + xten_nn.output %461 : tensor<1x184x12x20xbf16> loc(#loc112) + } -> tensor<1x184x12x20xbf16> loc(#loc112) + %255 = xten_nn.subgraph (%arg5 = %251: tensor<1x184x12x20xbf16>, %arg6 = %254: tensor<1x184x12x20xbf16>) attributes { + IfmOperands = [0 : index, 1 : index], + LayerName = "Mul_161", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Mul_161", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg7 = %arg5: tensor<1x184x12x20xbf16>, %arg8 = %arg6: tensor<1x184x12x20xbf16>) attributes { + LayerName = "Mul_161", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Mul_161", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + 
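+ // [editor's annotation, not part of the generated IR] Conv_153 (1x1, 80 -> 184), Conv_162 (3x3 depthwise) and
+ // Conv_171 (1x1, 184 -> 80, with OfmShare onto the Add_152 result) form an expand/depthwise/project sequence
+ // with a residual add, consistent with a MobileNetV3-style inverted-residual block; the hard-swish chains in
+ // between act as its activations.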
l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + Specializes = "MulBf16", + Traits = { + Binary = true, + Elementwise = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.dtype = "bfloat16", + config.num_kernel_iters = 0 : ui16 + }} { + %462 = tosa.mul %arg7, %arg8 { + LayerName = "Mul_161", + OutputName = "Mul_161", + shift = 0 : i8} : (tensor<1x184x12x20xbf16>, tensor<1x184x12x20xbf16>) -> tensor<1x184x12x20xbf16> loc(#loc113) + xten_nn.output %462 : tensor<1x184x12x20xbf16> loc(#loc113) + } -> tensor<1x184x12x20xbf16> loc(#loc113) + xten_nn.output %461 : tensor<1x184x12x20xbf16> loc(#loc113) + } -> tensor<1x184x12x20xbf16> loc(#loc113) + %256 = xten_nn.subgraph (%arg5 = %255: tensor<1x184x12x20xbf16>, %arg6 = %91: tensor<184x1x3x3xbf16>, %arg7 = %90: tensor<184xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Conv_162", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "CMHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[184, 1, 3, 3]> : vector<4xindex> + }, + { + UnknownDataFormat = true + } + ], + OutputName = "Conv_162", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg8 = %arg5: tensor<1x184x12x20xbf16>, %arg9 = %arg6: tensor<184x1x3x3xbf16>, %arg10 = %arg7: tensor<184xbf16>) attributes { + Dilations = array<i64: 1, 1>, + HWPadding = [[1, 1], [1, 1]], + LayerName = "Conv_162", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "CMHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.wts", + SubPort = "wts_data", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[184, 1, 3, 3]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Conv_162", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + Specializes = "DepthwiseConv2dBf16", + With = { + config.act = 0 : ui8, + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.kernel_height = 3 : ui8, + config.kernel_width = 3 : ui8, + config.stride = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %463 = "tosa.const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %464 = "tosa.const"() <{value = dense<[2, 3, 0, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc114) + %465 = tosa.transpose %arg9, 
%464 : (tensor<184x1x3x3xbf16>, tensor<4xi32>) -> tensor<3x3x184x1xbf16> loc(#loc114) + %466 = tosa.transpose %arg8, %463 : (tensor<1x184x12x20xbf16>, tensor<4xi32>) -> tensor<1x12x20x184xbf16> loc(#loc114) + %467 = tosa.depthwise_conv2d %466, %465, %arg10 { + PartOfLayerName = "Conv_162", + PartOfOutputName = "Conv_162", + dilation = array<i64: 1, 1>, + pad = array<i64: 1, 1, 1, 1>, + stride = array<i64: 1, 1>} : (tensor<1x12x20x184xbf16>, tensor<3x3x184x1xbf16>, tensor<184xbf16>) -> tensor<1x12x20x184xbf16> loc(#loc114) + %468 = tosa.transpose %467, %462 : (tensor<1x12x20x184xbf16>, tensor<4xi32>) -> tensor<1x184x12x20xbf16> loc(#loc114) + xten_nn.output %468 : tensor<1x184x12x20xbf16> loc(#loc114) + } -> tensor<1x184x12x20xbf16> loc(#loc114) + xten_nn.output %461 : tensor<1x184x12x20xbf16> loc(#loc114) + } -> tensor<1x184x12x20xbf16> loc(#loc114) + %257 = xten_nn.subgraph (%arg5 = %256: tensor<1x184x12x20xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Add_164", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Add_164", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x184x12x20xbf16>) attributes { + LayerName = "Add_164", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Add_164", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + Specializes = "AddAttributeBroadcastingBf16", + Traits = { + Elementwise = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.num_kernel_iters = 0 : ui16, + config.scalar = 3.000000e+00 : bf16, + config.scalar_position = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<3.000000e+00> : tensor<1x1x1x1xbf16>}> : () -> tensor<1x1x1x1xbf16> loc(#loc) + %463 = tosa.add %arg6, %462 {LayerName = "Add_164", OutputName = "Add_164"} : (tensor<1x184x12x20xbf16>, tensor<1x1x1x1xbf16>) -> tensor<1x184x12x20xbf16> loc(#loc115) + xten_nn.output %463 : tensor<1x184x12x20xbf16> loc(#loc115) + } -> tensor<1x184x12x20xbf16> loc(#loc115) + xten_nn.output %461 : tensor<1x184x12x20xbf16> loc(#loc115) + } -> tensor<1x184x12x20xbf16> loc(#loc115) + %258 = xten_nn.subgraph (%arg5 = %257: tensor<1x184x12x20xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Clip_167", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Clip_167", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + 
CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x184x12x20xbf16>) attributes { + LayerName = "Clip_167", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Clip_167", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + Specializes = "ClipBf16", + Traits = { + Elementwise = true, + NonNegativeOut = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.clamp_max = 6.000000e+00 : bf16, + config.clamp_min = 0.000000e+00 : bf16, + config.compiler = "chess", + config.ifm_shift = 0 : si8, + config.num_kernel_iters = 0 : ui16, + config.ofm_shift = 0 : si8 + }} { + %462 = tosa.clamp %arg6 { + LayerName = "Clip_167", + OutputName = "Clip_167", + max_fp = 6.000000e+00 : f32, + max_int = 6 : i64, + min_fp = 0.000000e+00 : f32, + min_int = 0 : i64} : (tensor<1x184x12x20xbf16>) -> tensor<1x184x12x20xbf16> loc(#loc116) + xten_nn.output %462 : tensor<1x184x12x20xbf16> loc(#loc116) + } -> tensor<1x184x12x20xbf16> loc(#loc116) + xten_nn.output %461 : tensor<1x184x12x20xbf16> loc(#loc116) + } -> tensor<1x184x12x20xbf16> loc(#loc116) + %259 = xten_nn.subgraph (%arg5 = %258: tensor<1x184x12x20xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Div_169", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Div_169", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x184x12x20xbf16>) attributes { + LayerName = "Div_169", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Div_169", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + Specializes = "MulAttributeBroadcastingBf16", + Traits = { + Elementwise = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.num_kernel_iters = 0 : ui16, + config.scalar = 1.660160e-01 : bf16, + config.scalar_position = 1 : ui8 + }} { + %462 = "tosa.const"() 
<{value = dense<1.660160e-01> : tensor<1x1x1x1xbf16>}> : () -> tensor<1x1x1x1xbf16> loc(#loc) + %463 = tosa.mul %arg6, %462 { + LayerName = "Div_169", + OutputName = "Div_169", + shift = 0 : i8} : (tensor<1x184x12x20xbf16>, tensor<1x1x1x1xbf16>) -> tensor<1x184x12x20xbf16> loc(#loc117) + xten_nn.output %463 : tensor<1x184x12x20xbf16> loc(#loc117) + } -> tensor<1x184x12x20xbf16> loc(#loc117) + xten_nn.output %461 : tensor<1x184x12x20xbf16> loc(#loc117) + } -> tensor<1x184x12x20xbf16> loc(#loc117) + %260 = xten_nn.subgraph (%arg5 = %256: tensor<1x184x12x20xbf16>, %arg6 = %259: tensor<1x184x12x20xbf16>) attributes { + IfmOperands = [0 : index, 1 : index], + LayerName = "Mul_170", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Mul_170", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg7 = %arg5: tensor<1x184x12x20xbf16>, %arg8 = %arg6: tensor<1x184x12x20xbf16>) attributes { + LayerName = "Mul_170", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Mul_170", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + } + ], + Specializes = "MulBf16", + Traits = { + Binary = true, + Elementwise = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.dtype = "bfloat16", + config.num_kernel_iters = 0 : ui16 + }} { + %462 = tosa.mul %arg7, %arg8 { + LayerName = "Mul_170", + OutputName = "Mul_170", + shift = 0 : i8} : (tensor<1x184x12x20xbf16>, tensor<1x184x12x20xbf16>) -> tensor<1x184x12x20xbf16> loc(#loc118) + xten_nn.output %462 : tensor<1x184x12x20xbf16> loc(#loc118) + } -> tensor<1x184x12x20xbf16> loc(#loc118) + xten_nn.output %461 : tensor<1x184x12x20xbf16> loc(#loc118) + } -> tensor<1x184x12x20xbf16> loc(#loc118) + %261 = xten_nn.subgraph (%arg5 = %260: tensor<1x184x12x20xbf16>, %arg6 = %89: tensor<80x184x1x1xbf16>, %arg7 = %88: tensor<80xbf16>, %arg8 = %250: tensor<1x80x12x20xbf16>) attributes { + IfmOperands = [0 : index, 3 : index], + LayerName = "Conv_171", + OfmShare = 3 : index, + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, 
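+ // [annotation, not compiler output] Add_164 -> Clip_167 -> Div_169 -> Mul_170 above is
+ // the decomposed hard-swish hswish(x) = x * clamp(x + 3, 0, 6) / 6: the division by 6
+ // is lowered to MulAttributeBroadcastingBf16 with config.scalar = 1.660160e-01
+ // (0.166015625, the bf16 constant standing in for 1/6), avoiding a divide kernel.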
+ l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + }, + { + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[80, 184, 1, 1]> : vector<4xindex> + }, + { + UnknownDataFormat = true + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 80, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Add_172", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 80, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg9 = %arg5: tensor<1x184x12x20xbf16>, %arg10 = %arg6: tensor<80x184x1x1xbf16>, %arg11 = %arg7: tensor<80xbf16>) attributes { + Dilations = array<i64: 1, 1>, + HWPadding = [[0, 0], [0, 0]], + LayerName = "Conv_171", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 184, 12, 20]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "wts_data", + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[80, 184, 1, 1]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Conv_171", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 80, 12, 20]> : vector<4xindex> + } + ], + Specializes = "Conv2DBf16", + Traits = { + AllowDMAOptimization = true + }, + With = { + config.AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 = 1 : ui8, + config.act = 0 : ui8, + config.act_type = "RELU", + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.conv_type = [0 : ui8, 12 : ui8, 64 : ui8], + config.dtype_ifm = "bfloat16", + config.dtype_ofm = "bfloat16", + config.dtype_wts = "bfloat16", + config.ksize.height = 1 : ui8, + config.ksize.width = 1 : ui8, + config.lrelu_alpha = 1.000000e+00 : bf16, + config.lrelu_alpha_kernel = 1.000000e+00 : bf16, + config.stride_h = 1 : ui8, + config.stride_w = 1 : ui8 + }} { + %463 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %464 = "tosa.const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc119) + %465 = tosa.reshape %arg10 {new_shape = array<i64: 80, 1, 1, 184>} : (tensor<80x184x1x1xbf16>) -> tensor<80x1x1x184xbf16> loc(#loc119) + %466 = tosa.transpose %arg9, %464 : (tensor<1x184x12x20xbf16>, tensor<4xi32>) -> tensor<1x12x20x184xbf16> loc(#loc119) + %467 = tosa.conv2d %466, %465, %arg11 { + PartOfLayerName = "Conv_171", + PartOfOutputName = "Conv_171", + dilation = array<i64: 1, 1>, + pad = array<i64: 0, 0, 0, 0>, + stride = array<i64: 1, 1>} : (tensor<1x12x20x184xbf16>, tensor<80x1x1x184xbf16>, tensor<80xbf16>) -> tensor<1x12x20x80xbf16> loc(#loc119) + %468 = tosa.transpose %467, %463 : (tensor<1x12x20x80xbf16>, tensor<4xi32>) -> tensor<1x80x12x20xbf16> loc(#loc119) + xten_nn.output %468 : tensor<1x80x12x20xbf16> loc(#loc119) + } -> tensor<1x80x12x20xbf16> loc(#loc119) + %462 =
xten_nn.subgraph (%arg9 = %461: tensor<1x80x12x20xbf16>, %arg10 = %arg8: tensor<1x80x12x20xbf16>) attributes { + LayerName = "Add_172", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 80, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 80, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Add_172", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 80, 12, 20]> : vector<4xindex> + } + ], + Specializes = "AddBf16", + Traits = { + Binary = true, + Elementwise = true + }, + With = { + config.act = 0 : ui8, + config.act_type = "LINEAR", + config.aie_arch = "aie2p", + config.compiler = "chess", + config.dtype = "bfloat16", + config.num_kernel_iters = 0 : ui16 + }} { + %463 = tosa.add %arg9, %arg10 {LayerName = "Add_172", OutputName = "Add_172"} : (tensor<1x80x12x20xbf16>, tensor<1x80x12x20xbf16>) -> tensor<1x80x12x20xbf16> loc(#loc120) + xten_nn.output %463 : tensor<1x80x12x20xbf16> loc(#loc120) + } -> tensor<1x80x12x20xbf16> loc(#loc120) + xten_nn.output %462 : tensor<1x80x12x20xbf16> loc(#loc120) + } -> tensor<1x80x12x20xbf16> loc(#loc332) + %262 = xten_nn.subgraph (%arg5 = %261: tensor<1x80x12x20xbf16>, %arg6 = %87: tensor<480x80x1x1xbf16>, %arg7 = %86: tensor<480xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Conv_173", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 80, 12, 20]> : vector<4xindex> + }, + { + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[480, 80, 1, 1]> : vector<4xindex> + }, + { + UnknownDataFormat = true + } + ], + OutputName = "Conv_173", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg8 = %arg5: tensor<1x80x12x20xbf16>, %arg9 = %arg6: tensor<480x80x1x1xbf16>, %arg10 = %arg7: tensor<480xbf16>) attributes { + Dilations = array, + HWPadding = [[0, 0], [0, 0]], + LayerName = "Conv_173", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 80, 12, 20]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "wts_data", + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[480, 80, 1, 1]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Conv_173", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = 
"data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 12, 20]> : vector<4xindex> + } + ], + Specializes = "Conv2DBf16", + Traits = { + AllowDMAOptimization = true + }, + With = { + config.AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 = 1 : ui8, + config.act = 0 : ui8, + config.act_type = "RELU", + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.conv_type = [0 : ui8, 12 : ui8, 64 : ui8], + config.dtype_ifm = "bfloat16", + config.dtype_ofm = "bfloat16", + config.dtype_wts = "bfloat16", + config.ksize.height = 1 : ui8, + config.ksize.width = 1 : ui8, + config.lrelu_alpha = 1.000000e+00 : bf16, + config.lrelu_alpha_kernel = 1.000000e+00 : bf16, + config.stride_h = 1 : ui8, + config.stride_w = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %463 = "tosa.const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc121) + %464 = tosa.reshape %arg9 {new_shape = array} : (tensor<480x80x1x1xbf16>) -> tensor<480x1x1x80xbf16> loc(#loc121) + %465 = tosa.transpose %arg8, %463 : (tensor<1x80x12x20xbf16>, tensor<4xi32>) -> tensor<1x12x20x80xbf16> loc(#loc121) + %466 = tosa.conv2d %465, %464, %arg10 { + PartOfLayerName = "Conv_173", + PartOfOutputName = "Conv_173", + dilation = array, + pad = array, + stride = array} : (tensor<1x12x20x80xbf16>, tensor<480x1x1x80xbf16>, tensor<480xbf16>) -> tensor<1x12x20x480xbf16> loc(#loc121) + %467 = tosa.transpose %466, %462 : (tensor<1x12x20x480xbf16>, tensor<4xi32>) -> tensor<1x480x12x20xbf16> loc(#loc121) + xten_nn.output %467 : tensor<1x480x12x20xbf16> loc(#loc121) + } -> tensor<1x480x12x20xbf16> loc(#loc121) + xten_nn.output %461 : tensor<1x480x12x20xbf16> loc(#loc121) + } -> tensor<1x480x12x20xbf16> loc(#loc121) + %263 = xten_nn.subgraph (%arg5 = %262: tensor<1x480x12x20xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Add_175", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Add_175", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x480x12x20xbf16>) attributes { + LayerName = "Add_175", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Add_175", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 12, 20]> : vector<4xindex> + } + ], + Specializes = "AddAttributeBroadcastingBf16", + Traits = { + Elementwise = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.num_kernel_iters = 0 : ui16, + config.scalar = 3.000000e+00 : bf16, + 
config.scalar_position = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<3.000000e+00> : tensor<1x1x1x1xbf16>}> : () -> tensor<1x1x1x1xbf16> loc(#loc) + %463 = tosa.add %arg6, %462 {LayerName = "Add_175", OutputName = "Add_175"} : (tensor<1x480x12x20xbf16>, tensor<1x1x1x1xbf16>) -> tensor<1x480x12x20xbf16> loc(#loc122) + xten_nn.output %463 : tensor<1x480x12x20xbf16> loc(#loc122) + } -> tensor<1x480x12x20xbf16> loc(#loc122) + xten_nn.output %461 : tensor<1x480x12x20xbf16> loc(#loc122) + } -> tensor<1x480x12x20xbf16> loc(#loc122) + %264 = xten_nn.subgraph (%arg5 = %263: tensor<1x480x12x20xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Clip_178", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Clip_178", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x480x12x20xbf16>) attributes { + LayerName = "Clip_178", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Clip_178", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 12, 20]> : vector<4xindex> + } + ], + Specializes = "ClipBf16", + Traits = { + Elementwise = true, + NonNegativeOut = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.clamp_max = 6.000000e+00 : bf16, + config.clamp_min = 0.000000e+00 : bf16, + config.compiler = "chess", + config.ifm_shift = 0 : si8, + config.num_kernel_iters = 0 : ui16, + config.ofm_shift = 0 : si8 + }} { + %462 = tosa.clamp %arg6 { + LayerName = "Clip_178", + OutputName = "Clip_178", + max_fp = 6.000000e+00 : f32, + max_int = 6 : i64, + min_fp = 0.000000e+00 : f32, + min_int = 0 : i64} : (tensor<1x480x12x20xbf16>) -> tensor<1x480x12x20xbf16> loc(#loc123) + xten_nn.output %462 : tensor<1x480x12x20xbf16> loc(#loc123) + } -> tensor<1x480x12x20xbf16> loc(#loc123) + xten_nn.output %461 : tensor<1x480x12x20xbf16> loc(#loc123) + } -> tensor<1x480x12x20xbf16> loc(#loc123) + %265 = xten_nn.subgraph (%arg5 = %264: tensor<1x480x12x20xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Div_180", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Div_180", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, 
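+ // [annotation, not compiler output] ClipBf16 with clamp_min = 0 and clamp_max = 6 is a
+ // plain ReLU6; the NonNegativeOut trait records that the result is provably >= 0, which
+ // later passes appear to exploit when simplifying downstream kernels.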
+ L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x480x12x20xbf16>) attributes { + LayerName = "Div_180", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Div_180", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 12, 20]> : vector<4xindex> + } + ], + Specializes = "MulAttributeBroadcastingBf16", + Traits = { + Elementwise = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.num_kernel_iters = 0 : ui16, + config.scalar = 1.660160e-01 : bf16, + config.scalar_position = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<1.660160e-01> : tensor<1x1x1x1xbf16>}> : () -> tensor<1x1x1x1xbf16> loc(#loc) + %463 = tosa.mul %arg6, %462 { + LayerName = "Div_180", + OutputName = "Div_180", + shift = 0 : i8} : (tensor<1x480x12x20xbf16>, tensor<1x1x1x1xbf16>) -> tensor<1x480x12x20xbf16> loc(#loc124) + xten_nn.output %463 : tensor<1x480x12x20xbf16> loc(#loc124) + } -> tensor<1x480x12x20xbf16> loc(#loc124) + xten_nn.output %461 : tensor<1x480x12x20xbf16> loc(#loc124) + } -> tensor<1x480x12x20xbf16> loc(#loc124) + %266 = xten_nn.subgraph (%arg5 = %262: tensor<1x480x12x20xbf16>, %arg6 = %265: tensor<1x480x12x20xbf16>) attributes { + IfmOperands = [0 : index, 1 : index], + LayerName = "Mul_181", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Mul_181", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg7 = %arg5: tensor<1x480x12x20xbf16>, %arg8 = %arg6: tensor<1x480x12x20xbf16>) attributes { + LayerName = "Mul_181", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Mul_181", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 12, 20]> : vector<4xindex> + } + ], + Specializes = "MulBf16", + Traits = 
{ + Binary = true, + Elementwise = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.dtype = "bfloat16", + config.num_kernel_iters = 0 : ui16 + }} { + %462 = tosa.mul %arg7, %arg8 { + LayerName = "Mul_181", + OutputName = "Mul_181", + shift = 0 : i8} : (tensor<1x480x12x20xbf16>, tensor<1x480x12x20xbf16>) -> tensor<1x480x12x20xbf16> loc(#loc125) + xten_nn.output %462 : tensor<1x480x12x20xbf16> loc(#loc125) + } -> tensor<1x480x12x20xbf16> loc(#loc125) + xten_nn.output %461 : tensor<1x480x12x20xbf16> loc(#loc125) + } -> tensor<1x480x12x20xbf16> loc(#loc125) + %267 = xten_nn.subgraph (%arg5 = %266: tensor<1x480x12x20xbf16>, %arg6 = %85: tensor<480x1x3x3xbf16>, %arg7 = %84: tensor<480xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Conv_182", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "CMHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[480, 1, 3, 3]> : vector<4xindex> + }, + { + UnknownDataFormat = true + } + ], + OutputName = "Conv_182", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg8 = %arg5: tensor<1x480x12x20xbf16>, %arg9 = %arg6: tensor<480x1x3x3xbf16>, %arg10 = %arg7: tensor<480xbf16>) attributes { + Dilations = array, + HWPadding = [[1, 1], [1, 1]], + LayerName = "Conv_182", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "CMHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.wts", + SubPort = "wts_data", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[480, 1, 3, 3]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Conv_182", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 12, 20]> : vector<4xindex> + } + ], + Specializes = "DepthwiseConv2dBf16", + With = { + config.act = 0 : ui8, + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.kernel_height = 3 : ui8, + config.kernel_width = 3 : ui8, + config.stride = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %463 = "tosa.const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %464 = "tosa.const"() <{value = dense<[2, 3, 0, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc126) + %465 = tosa.transpose %arg9, %464 : (tensor<480x1x3x3xbf16>, tensor<4xi32>) -> tensor<3x3x480x1xbf16> loc(#loc126) + %466 = tosa.transpose %arg8, %463 : (tensor<1x480x12x20xbf16>, 
tensor<4xi32>) -> tensor<1x12x20x480xbf16> loc(#loc126) + %467 = tosa.depthwise_conv2d %466, %465, %arg10 { + PartOfLayerName = "Conv_182", + PartOfOutputName = "Conv_182", + dilation = array, + pad = array, + stride = array} : (tensor<1x12x20x480xbf16>, tensor<3x3x480x1xbf16>, tensor<480xbf16>) -> tensor<1x12x20x480xbf16> loc(#loc126) + %468 = tosa.transpose %467, %462 : (tensor<1x12x20x480xbf16>, tensor<4xi32>) -> tensor<1x480x12x20xbf16> loc(#loc126) + xten_nn.output %468 : tensor<1x480x12x20xbf16> loc(#loc126) + } -> tensor<1x480x12x20xbf16> loc(#loc126) + xten_nn.output %461 : tensor<1x480x12x20xbf16> loc(#loc126) + } -> tensor<1x480x12x20xbf16> loc(#loc126) + %268 = xten_nn.subgraph (%arg5 = %267: tensor<1x480x12x20xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Add_184", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Add_184", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x480x12x20xbf16>) attributes { + LayerName = "Add_184", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Add_184", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 12, 20]> : vector<4xindex> + } + ], + Specializes = "AddAttributeBroadcastingBf16", + Traits = { + Elementwise = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.num_kernel_iters = 0 : ui16, + config.scalar = 3.000000e+00 : bf16, + config.scalar_position = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<3.000000e+00> : tensor<1x1x1x1xbf16>}> : () -> tensor<1x1x1x1xbf16> loc(#loc) + %463 = tosa.add %arg6, %462 {LayerName = "Add_184", OutputName = "Add_184"} : (tensor<1x480x12x20xbf16>, tensor<1x1x1x1xbf16>) -> tensor<1x480x12x20xbf16> loc(#loc127) + xten_nn.output %463 : tensor<1x480x12x20xbf16> loc(#loc127) + } -> tensor<1x480x12x20xbf16> loc(#loc127) + xten_nn.output %461 : tensor<1x480x12x20xbf16> loc(#loc127) + } -> tensor<1x480x12x20xbf16> loc(#loc127) + %269 = xten_nn.subgraph (%arg5 = %268: tensor<1x480x12x20xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Clip_187", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Clip_187", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 12, 
20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x480x12x20xbf16>) attributes { + LayerName = "Clip_187", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Clip_187", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 12, 20]> : vector<4xindex> + } + ], + Specializes = "ClipBf16", + Traits = { + Elementwise = true, + NonNegativeOut = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.clamp_max = 6.000000e+00 : bf16, + config.clamp_min = 0.000000e+00 : bf16, + config.compiler = "chess", + config.ifm_shift = 0 : si8, + config.num_kernel_iters = 0 : ui16, + config.ofm_shift = 0 : si8 + }} { + %462 = tosa.clamp %arg6 { + LayerName = "Clip_187", + OutputName = "Clip_187", + max_fp = 6.000000e+00 : f32, + max_int = 6 : i64, + min_fp = 0.000000e+00 : f32, + min_int = 0 : i64} : (tensor<1x480x12x20xbf16>) -> tensor<1x480x12x20xbf16> loc(#loc128) + xten_nn.output %462 : tensor<1x480x12x20xbf16> loc(#loc128) + } -> tensor<1x480x12x20xbf16> loc(#loc128) + xten_nn.output %461 : tensor<1x480x12x20xbf16> loc(#loc128) + } -> tensor<1x480x12x20xbf16> loc(#loc128) + %270 = xten_nn.subgraph (%arg5 = %269: tensor<1x480x12x20xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Div_189", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Div_189", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x480x12x20xbf16>) attributes { + LayerName = "Div_189", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Div_189", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 12, 20]> : vector<4xindex> + } + ], + Specializes = "MulAttributeBroadcastingBf16", + Traits = { + Elementwise = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.num_kernel_iters = 0 : ui16, + config.scalar = 1.660160e-01 : bf16, + config.scalar_position = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<1.660160e-01> : tensor<1x1x1x1xbf16>}> : () -> tensor<1x1x1x1xbf16> loc(#loc) + %463 = tosa.mul %arg6, %462 { + LayerName = "Div_189", + OutputName = 
"Div_189", + shift = 0 : i8} : (tensor<1x480x12x20xbf16>, tensor<1x1x1x1xbf16>) -> tensor<1x480x12x20xbf16> loc(#loc129) + xten_nn.output %463 : tensor<1x480x12x20xbf16> loc(#loc129) + } -> tensor<1x480x12x20xbf16> loc(#loc129) + xten_nn.output %461 : tensor<1x480x12x20xbf16> loc(#loc129) + } -> tensor<1x480x12x20xbf16> loc(#loc129) + %271 = xten_nn.subgraph (%arg5 = %267: tensor<1x480x12x20xbf16>, %arg6 = %270: tensor<1x480x12x20xbf16>) attributes { + IfmOperands = [0 : index, 1 : index], + LayerName = "Mul_190", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Mul_190", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg7 = %arg5: tensor<1x480x12x20xbf16>, %arg8 = %arg6: tensor<1x480x12x20xbf16>) attributes { + LayerName = "Mul_190", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Mul_190", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 12, 20]> : vector<4xindex> + } + ], + Specializes = "MulBf16", + Traits = { + Binary = true, + Elementwise = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.dtype = "bfloat16", + config.num_kernel_iters = 0 : ui16 + }} { + %462 = tosa.mul %arg7, %arg8 { + LayerName = "Mul_190", + OutputName = "Mul_190", + shift = 0 : i8} : (tensor<1x480x12x20xbf16>, tensor<1x480x12x20xbf16>) -> tensor<1x480x12x20xbf16> loc(#loc130) + xten_nn.output %462 : tensor<1x480x12x20xbf16> loc(#loc130) + } -> tensor<1x480x12x20xbf16> loc(#loc130) + xten_nn.output %461 : tensor<1x480x12x20xbf16> loc(#loc130) + } -> tensor<1x480x12x20xbf16> loc(#loc130) + %272 = xten_nn.subgraph (%arg5 = %271: tensor<1x480x12x20xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Generated-#24", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Generated-#25", + Overlay = "1x1_1x1_unspecifiedConnectivity", + Reason = "TemplatedGraph", + Results = [ + { + 
CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 1, 240]> : vector<4xindex> + } + ], + Specializes = "Transpose4dAdf", + With = { + config.aie_arch = "aie2p", + config.dim_0 = 12 : ui32, + config.dim_1 = 60 : ui32, + config.dim_2 = 20 : ui32, + config.dim_3 = 8 : ui32, + config.dtype = "bfloat16", + config.perm = 6 : ui32 + }} { + %461 = tosa.reshape %arg5 {new_shape = array<i64: 1, 480, 1, 240>} : (tensor<1x480x12x20xbf16>) -> tensor<1x480x1x240xbf16> loc(#loc333) + xten_nn.output %461 : tensor<1x480x1x240xbf16> loc(#loc333) + } -> tensor<1x480x1x240xbf16> loc(#loc333) + %273 = xten_nn.subgraph (%arg5 = %272: tensor<1x480x1x240xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Generated-#26", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 1, 240]> : vector<4xindex> + } + ], + OutputName = "Generated-#27", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 1, 1]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x480x1x240xbf16>) attributes { + LayerName = "Generated-#26", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 1, 240]> : vector<4xindex> + } + ], + OutputName = "Generated-#27", + PadValue = 0.000000e+00 : bf16, + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 1, 1]> : vector<4xindex> + } + ], + Specializes = "ReduceMeanC8Bf16", + Traits = { + Reduce = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.full_channel = 480 : ui32, + config.full_height = 1 : ui32, + config.full_width = 240 : ui32, + config.reduce_dim = "W" + }} { + %462 = xten_nn.reduce_mean %arg6 {axes = array<i64: 3>, keepdims = 1 : i64} : (tensor<1x480x1x240xbf16>) -> tensor<1x480x1x1xbf16> loc(#loc131) + xten_nn.output %462 : tensor<1x480x1x1xbf16> loc(#loc131) + } -> tensor<1x480x1x1xbf16> loc(#loc131) + xten_nn.output %461 : tensor<1x480x1x1xbf16> loc(#loc131) + } -> tensor<1x480x1x1xbf16> loc(#loc131) + %274 = xten_nn.subgraph (%arg5 = %273: tensor<1x480x1x1xbf16>, %arg6 = %83: tensor<120x480x1x1xbf16>, %arg7 = %82: tensor<120xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Conv_192", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 1, 1]> : vector<4xindex> + }, + { + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[120, 480, 1, 1]> : vector<4xindex> + }, + { + UnknownDataFormat = true + } + ], + OutputName = "Relu_193", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason =
"InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 1, 1]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg8 = %arg5: tensor<1x480x1x1xbf16>, %arg9 = %arg6: tensor<120x480x1x1xbf16>, %arg10 = %arg7: tensor<120xbf16>) attributes { + Dilations = array, + HWPadding = [[0, 0], [0, 0]], + LayerName = "Conv_192", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 1, 1]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "wts_data", + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[120, 480, 1, 1]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Relu_193", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 1, 1]> : vector<4xindex> + } + ], + Specializes = "Conv2DBf16", + Traits = { + AllowDMAOptimization = true, + NonNegativeOut = true + }, + With = { + config.AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 = 1 : ui8, + config.act = 1 : ui8, + config.act_type = "RELU", + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.conv_type = [0 : ui8, 12 : ui8, 64 : ui8], + config.dtype_ifm = "bfloat16", + config.dtype_ofm = "bfloat16", + config.dtype_wts = "bfloat16", + config.ksize.height = 1 : ui8, + config.ksize.width = 1 : ui8, + config.lrelu_alpha = 0.000000e+00 : bf16, + config.lrelu_alpha_kernel = 0.000000e+00 : bf16, + config.stride_h = 1 : ui8, + config.stride_w = 1 : ui8 + }} { + %462 = tosa.reshape %arg9 {new_shape = array} : (tensor<120x480x1x1xbf16>) -> tensor<120x1x1x480xbf16> loc(#loc334) + %463 = tosa.reshape %arg8 {new_shape = array} : (tensor<1x480x1x1xbf16>) -> tensor<1x1x1x480xbf16> loc(#loc334) + %464 = tosa.conv2d %463, %462, %arg10 { + PartOfLayerName = "Conv_192", + PartOfOutputName = "Conv_192", + dilation = array, + pad = array, + stride = array} : (tensor<1x1x1x480xbf16>, tensor<120x1x1x480xbf16>, tensor<120xbf16>) -> tensor<1x1x1x120xbf16> loc(#loc132) + %465 = tosa.clamp %464 { + LayerName = "Relu_193", + OutputName = "Relu_193", + max_fp = 3.40282347E+38 : f32, + max_int = 2147483647 : i64, + min_fp = 0.000000e+00 : f32, + min_int = 0 : i64} : (tensor<1x1x1x120xbf16>) -> tensor<1x1x1x120xbf16> loc(#loc133) + %466 = tosa.reshape %465 {new_shape = array} : (tensor<1x1x1x120xbf16>) -> tensor<1x120x1x1xbf16> loc(#loc334) + xten_nn.output %466 : tensor<1x120x1x1xbf16> loc(#loc133) + } -> tensor<1x120x1x1xbf16> loc(#loc334) + xten_nn.output %461 : tensor<1x120x1x1xbf16> loc(#loc334) + } -> tensor<1x120x1x1xbf16> loc(#loc334) + %275 = xten_nn.subgraph (%arg5 = %274: tensor<1x120x1x1xbf16>, %arg6 = %81: tensor<480x120x1x1xbf16>, %arg7 = %80: tensor<480xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Conv_194", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 1, 1]> : 
vector<4xindex> + }, + { + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[480, 120, 1, 1]> : vector<4xindex> + }, + { + UnknownDataFormat = true + } + ], + OutputName = "Conv_194", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 1, 1]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg8 = %arg5: tensor<1x120x1x1xbf16>, %arg9 = %arg6: tensor<480x120x1x1xbf16>, %arg10 = %arg7: tensor<480xbf16>) attributes { + Dilations = array, + HWPadding = [[0, 0], [0, 0]], + LayerName = "Conv_194", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 120, 1, 1]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "wts_data", + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[480, 120, 1, 1]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Conv_194", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 1, 1]> : vector<4xindex> + } + ], + Specializes = "Conv2DBf16", + Traits = { + AllowDMAOptimization = true + }, + With = { + config.AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 = 1 : ui8, + config.act = 0 : ui8, + config.act_type = "RELU", + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.conv_type = [0 : ui8, 12 : ui8, 64 : ui8], + config.dtype_ifm = "bfloat16", + config.dtype_ofm = "bfloat16", + config.dtype_wts = "bfloat16", + config.ksize.height = 1 : ui8, + config.ksize.width = 1 : ui8, + config.lrelu_alpha = 1.000000e+00 : bf16, + config.lrelu_alpha_kernel = 1.000000e+00 : bf16, + config.stride_h = 1 : ui8, + config.stride_w = 1 : ui8 + }} { + %462 = tosa.reshape %arg9 {new_shape = array} : (tensor<480x120x1x1xbf16>) -> tensor<480x1x1x120xbf16> loc(#loc134) + %463 = tosa.reshape %arg8 {new_shape = array} : (tensor<1x120x1x1xbf16>) -> tensor<1x1x1x120xbf16> loc(#loc134) + %464 = tosa.conv2d %463, %462, %arg10 { + PartOfLayerName = "Conv_194", + PartOfOutputName = "Conv_194", + dilation = array, + pad = array, + stride = array} : (tensor<1x1x1x120xbf16>, tensor<480x1x1x120xbf16>, tensor<480xbf16>) -> tensor<1x1x1x480xbf16> loc(#loc134) + %465 = tosa.reshape %464 {new_shape = array} : (tensor<1x1x1x480xbf16>) -> tensor<1x480x1x1xbf16> loc(#loc134) + xten_nn.output %465 : tensor<1x480x1x1xbf16> loc(#loc134) + } -> tensor<1x480x1x1xbf16> loc(#loc134) + xten_nn.output %461 : tensor<1x480x1x1xbf16> loc(#loc134) + } -> tensor<1x480x1x1xbf16> loc(#loc134) + %276 = xten_nn.subgraph (%arg5 = %275: tensor<1x480x1x1xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Add_196", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 1, 1]> : vector<4xindex> + } + ], + OutputName = "Add_196", + 
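+ // [annotation, not compiler output] Conv_192 (480 -> 120, with Relu_193 fused via
+ // config.act = 1) and Conv_194 (120 -> 480) are the excite bottleneck; Add_196 ->
+ // Clip_199 -> Div_201 below compute the hard-sigmoid gate clamp(x + 3, 0, 6) / 6, which
+ // TileAdf broadcasts from 1x480x1x1 back to 1x480x12x20 for the per-channel Mul_202.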
Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 1, 1]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x480x1x1xbf16>) attributes { + LayerName = "Add_196", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 1, 1]> : vector<4xindex> + } + ], + OutputName = "Add_196", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 1, 1]> : vector<4xindex> + } + ], + Specializes = "AddAttributeBroadcastingBf16", + Traits = { + Elementwise = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.num_kernel_iters = 0 : ui16, + config.scalar = 3.000000e+00 : bf16, + config.scalar_position = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<3.000000e+00> : tensor<1x1x1x1xbf16>}> : () -> tensor<1x1x1x1xbf16> loc(#loc) + %463 = tosa.add %arg6, %462 {LayerName = "Add_196", OutputName = "Add_196"} : (tensor<1x480x1x1xbf16>, tensor<1x1x1x1xbf16>) -> tensor<1x480x1x1xbf16> loc(#loc135) + xten_nn.output %463 : tensor<1x480x1x1xbf16> loc(#loc135) + } -> tensor<1x480x1x1xbf16> loc(#loc135) + xten_nn.output %461 : tensor<1x480x1x1xbf16> loc(#loc135) + } -> tensor<1x480x1x1xbf16> loc(#loc135) + %277 = xten_nn.subgraph (%arg5 = %276: tensor<1x480x1x1xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Clip_199", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 1, 1]> : vector<4xindex> + } + ], + OutputName = "Clip_199", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 1, 1]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x480x1x1xbf16>) attributes { + LayerName = "Clip_199", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 1, 1]> : vector<4xindex> + } + ], + OutputName = "Clip_199", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 1, 1]> : vector<4xindex> + } + ], + Specializes = "ClipBf16", + Traits = { + Elementwise = true, + NonNegativeOut = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.clamp_max = 6.000000e+00 : bf16, + config.clamp_min = 0.000000e+00 : bf16, + config.compiler = "chess", + config.ifm_shift = 0 : 
si8, + config.num_kernel_iters = 0 : ui16, + config.ofm_shift = 0 : si8 + }} { + %462 = tosa.clamp %arg6 { + LayerName = "Clip_199", + OutputName = "Clip_199", + max_fp = 6.000000e+00 : f32, + max_int = 6 : i64, + min_fp = 0.000000e+00 : f32, + min_int = 0 : i64} : (tensor<1x480x1x1xbf16>) -> tensor<1x480x1x1xbf16> loc(#loc136) + xten_nn.output %462 : tensor<1x480x1x1xbf16> loc(#loc136) + } -> tensor<1x480x1x1xbf16> loc(#loc136) + xten_nn.output %461 : tensor<1x480x1x1xbf16> loc(#loc136) + } -> tensor<1x480x1x1xbf16> loc(#loc136) + %278 = xten_nn.subgraph (%arg5 = %277: tensor<1x480x1x1xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Div_201", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 1, 1]> : vector<4xindex> + } + ], + OutputName = "Div_201", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 1, 1]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x480x1x1xbf16>) attributes { + LayerName = "Div_201", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 1, 1]> : vector<4xindex> + } + ], + OutputName = "Div_201", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 1, 1]> : vector<4xindex> + } + ], + Specializes = "MulAttributeBroadcastingBf16", + Traits = { + Elementwise = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.num_kernel_iters = 0 : ui16, + config.scalar = 1.660160e-01 : bf16, + config.scalar_position = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<1.660160e-01> : tensor<1x1x1x1xbf16>}> : () -> tensor<1x1x1x1xbf16> loc(#loc) + %463 = tosa.mul %arg6, %462 { + LayerName = "Div_201", + OutputName = "Div_201", + shift = 0 : i8} : (tensor<1x480x1x1xbf16>, tensor<1x1x1x1xbf16>) -> tensor<1x480x1x1xbf16> loc(#loc137) + xten_nn.output %463 : tensor<1x480x1x1xbf16> loc(#loc137) + } -> tensor<1x480x1x1xbf16> loc(#loc137) + xten_nn.output %461 : tensor<1x480x1x1xbf16> loc(#loc137) + } -> tensor<1x480x1x1xbf16> loc(#loc137) + %279 = xten_nn.subgraph (%arg5 = %278: tensor<1x480x1x1xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Generated-#28", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 1, 1]> : vector<4xindex> + } + ], + OutputName = "Generated-#29", + Overlay = "1x1_1x1_unspecifiedConnectivity", + Reason = "TemplatedGraph", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count 
= dense<[1, 480, 12, 20]> : vector<4xindex> + } + ], + Specializes = "TileAdf", + With = { + config.aie_arch = "aie2p", + config.dtype = "bfloat16", + config.i_dim_c = 480 : ui32, + config.i_dim_h = 1 : ui32, + config.i_dim_n = 1 : ui32, + config.i_dim_w = 1 : ui32, + config.rep_dim_c = 1 : ui32, + config.rep_dim_h = 12 : ui32, + config.rep_dim_w = 20 : ui32 + }} { + %461 = tosa.tile %arg5 {multiples = array} : (tensor<1x480x1x1xbf16>) -> tensor<1x480x12x20xbf16> loc(#loc138) + xten_nn.output %461 : tensor<1x480x12x20xbf16> loc(#loc138) + } -> tensor<1x480x12x20xbf16> loc(#loc138) + %280 = xten_nn.subgraph (%arg5 = %279: tensor<1x480x12x20xbf16>, %arg6 = %271: tensor<1x480x12x20xbf16>) attributes { + IfmOperands = [0 : index, 1 : index], + LayerName = "Mul_202", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Mul_202", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg7 = %arg5: tensor<1x480x12x20xbf16>, %arg8 = %arg6: tensor<1x480x12x20xbf16>) attributes { + LayerName = "Mul_202", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Mul_202", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 12, 20]> : vector<4xindex> + } + ], + Specializes = "MulBf16", + Traits = { + Binary = true, + Elementwise = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.dtype = "bfloat16", + config.num_kernel_iters = 0 : ui16 + }} { + %462 = tosa.mul %arg7, %arg8 { + LayerName = "Mul_202", + OutputName = "Mul_202", + shift = 0 : i8} : (tensor<1x480x12x20xbf16>, tensor<1x480x12x20xbf16>) -> tensor<1x480x12x20xbf16> loc(#loc138) + xten_nn.output %462 : tensor<1x480x12x20xbf16> loc(#loc138) + } -> tensor<1x480x12x20xbf16> loc(#loc138) + xten_nn.output %461 : tensor<1x480x12x20xbf16> loc(#loc138) + } -> tensor<1x480x12x20xbf16> loc(#loc138) + %281 = xten_nn.subgraph (%arg5 = %280: tensor<1x480x12x20xbf16>, %arg6 = %79: tensor<112x480x1x1xbf16>, %arg7 = %78: tensor<112xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Conv_203", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : 
vector<4xindex>, + l3_tile_count = dense<[1, 480, 12, 20]> : vector<4xindex> + }, + { + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[112, 480, 1, 1]> : vector<4xindex> + }, + { + UnknownDataFormat = true + } + ], + OutputName = "Conv_203", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 112, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg8 = %arg5: tensor<1x480x12x20xbf16>, %arg9 = %arg6: tensor<112x480x1x1xbf16>, %arg10 = %arg7: tensor<112xbf16>) attributes { + Dilations = array<i64: 1, 1>, + HWPadding = [[0, 0], [0, 0]], + LayerName = "Conv_203", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 480, 12, 20]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "wts_data", + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[112, 480, 1, 1]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Conv_203", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 112, 12, 20]> : vector<4xindex> + } + ], + Specializes = "Conv2DBf16", + Traits = { + AllowDMAOptimization = true + }, + With = { + config.AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 = 1 : ui8, + config.act = 0 : ui8, + config.act_type = "RELU", + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.conv_type = [0 : ui8, 12 : ui8, 64 : ui8], + config.dtype_ifm = "bfloat16", + config.dtype_ofm = "bfloat16", + config.dtype_wts = "bfloat16", + config.ksize.height = 1 : ui8, + config.ksize.width = 1 : ui8, + config.lrelu_alpha = 1.000000e+00 : bf16, + config.lrelu_alpha_kernel = 1.000000e+00 : bf16, + config.stride_h = 1 : ui8, + config.stride_w = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %463 = "tosa.const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc139) + %464 = tosa.reshape %arg9 {new_shape = array<i64: 112, 1, 1, 480>} : (tensor<112x480x1x1xbf16>) -> tensor<112x1x1x480xbf16> loc(#loc139) + %465 = tosa.transpose %arg8, %463 : (tensor<1x480x12x20xbf16>, tensor<4xi32>) -> tensor<1x12x20x480xbf16> loc(#loc139) + %466 = tosa.conv2d %465, %464, %arg10 { + PartOfLayerName = "Conv_203", + PartOfOutputName = "Conv_203", + dilation = array<i64: 1, 1>, + pad = array<i64: 0, 0, 0, 0>, + stride = array<i64: 1, 1>} : (tensor<1x12x20x480xbf16>, tensor<112x1x1x480xbf16>, tensor<112xbf16>) -> tensor<1x12x20x112xbf16> loc(#loc139) + %467 = tosa.transpose %466, %462 : (tensor<1x12x20x112xbf16>, tensor<4xi32>) -> tensor<1x112x12x20xbf16> loc(#loc139) + xten_nn.output %467 : tensor<1x112x12x20xbf16> loc(#loc139) + } -> tensor<1x112x12x20xbf16> loc(#loc139) + xten_nn.output %461 : tensor<1x112x12x20xbf16> loc(#loc139) + } -> tensor<1x112x12x20xbf16> loc(#loc139) + %282 = xten_nn.subgraph (%arg5 = %281: tensor<1x112x12x20xbf16>, %arg6 = %77: tensor<672x112x1x1xbf16>, %arg7 = %76: tensor<672xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Conv_204", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 112, 12, 20]> : vector<4xindex> + }, + { + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[672, 112, 1, 1]> : vector<4xindex> + }, + { + UnknownDataFormat = true + } + ], + OutputName = "Conv_204", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg8 = %arg5: tensor<1x112x12x20xbf16>, %arg9 = %arg6: tensor<672x112x1x1xbf16>, %arg10 = %arg7: tensor<672xbf16>) attributes { + Dilations = array<i64: 1, 1>, + HWPadding = [[0, 0], [0, 0]], + LayerName = "Conv_204", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 112, 12, 20]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "wts_data", + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[672, 112, 1, 1]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Conv_204", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + Specializes = "Conv2DBf16", + Traits = { + AllowDMAOptimization = true + }, + With = { + config.AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 = 1 : ui8, + config.act = 0 : ui8, + config.act_type = "RELU", + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.conv_type = [0 : ui8, 12 : ui8, 64 : ui8], + config.dtype_ifm = "bfloat16", + config.dtype_ofm = "bfloat16", + config.dtype_wts = "bfloat16", + config.ksize.height = 1 : ui8, + config.ksize.width = 1 : ui8, + config.lrelu_alpha = 1.000000e+00 : bf16, + config.lrelu_alpha_kernel = 1.000000e+00 : bf16, + config.stride_h = 1 : ui8, + config.stride_w = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %463 = "tosa.const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc140) + %464 = tosa.reshape %arg9 {new_shape = array<i64: 672, 1, 1, 112>} : (tensor<672x112x1x1xbf16>) -> tensor<672x1x1x112xbf16> loc(#loc140) + %465 = tosa.transpose %arg8, %463 : (tensor<1x112x12x20xbf16>, tensor<4xi32>) -> tensor<1x12x20x112xbf16> loc(#loc140) + %466 = tosa.conv2d %465, %464, %arg10 { + PartOfLayerName = "Conv_204", + PartOfOutputName = "Conv_204", + dilation = array<i64: 1, 1>, + pad = array<i64: 0, 0, 0, 0>, + stride = array<i64: 1, 1>} : (tensor<1x12x20x112xbf16>, tensor<672x1x1x112xbf16>, tensor<672xbf16>) -> tensor<1x12x20x672xbf16> loc(#loc140) + %467 = tosa.transpose %466, %462 : (tensor<1x12x20x672xbf16>, tensor<4xi32>) -> tensor<1x672x12x20xbf16> loc(#loc140) + xten_nn.output %467 :
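// NOTE: Conv_203/Conv_204 above are 1x1 convolutions lowered to tosa.conv2d, which takes NHWC
// activations and OHWI weights: each OxCx1x1 weight tensor is reshaped to Ox1x1xC, and the NCHW
// feature map is bracketed by a transpose pair (perm [0, 2, 3, 1] going in, perm [0, 3, 1, 2]
// coming out). This reading follows directly from the tensor types printed in the IR.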
tensor<1x672x12x20xbf16> loc(#loc140) + } -> tensor<1x672x12x20xbf16> loc(#loc140) + xten_nn.output %461 : tensor<1x672x12x20xbf16> loc(#loc140) + } -> tensor<1x672x12x20xbf16> loc(#loc140) + %283 = xten_nn.subgraph (%arg5 = %282: tensor<1x672x12x20xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Add_206", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Add_206", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x672x12x20xbf16>) attributes { + LayerName = "Add_206", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Add_206", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + Specializes = "AddAttributeBroadcastingBf16", + Traits = { + Elementwise = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.num_kernel_iters = 0 : ui16, + config.scalar = 3.000000e+00 : bf16, + config.scalar_position = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<3.000000e+00> : tensor<1x1x1x1xbf16>}> : () -> tensor<1x1x1x1xbf16> loc(#loc) + %463 = tosa.add %arg6, %462 {LayerName = "Add_206", OutputName = "Add_206"} : (tensor<1x672x12x20xbf16>, tensor<1x1x1x1xbf16>) -> tensor<1x672x12x20xbf16> loc(#loc141) + xten_nn.output %463 : tensor<1x672x12x20xbf16> loc(#loc141) + } -> tensor<1x672x12x20xbf16> loc(#loc141) + xten_nn.output %461 : tensor<1x672x12x20xbf16> loc(#loc141) + } -> tensor<1x672x12x20xbf16> loc(#loc141) + %284 = xten_nn.subgraph (%arg5 = %283: tensor<1x672x12x20xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Clip_209", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Clip_209", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x672x12x20xbf16>) attributes { + LayerName = "Clip_209", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : 
vector<4xindex> + } + ], + OutputName = "Clip_209", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + Specializes = "ClipBf16", + Traits = { + Elementwise = true, + NonNegativeOut = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.clamp_max = 6.000000e+00 : bf16, + config.clamp_min = 0.000000e+00 : bf16, + config.compiler = "chess", + config.ifm_shift = 0 : si8, + config.num_kernel_iters = 0 : ui16, + config.ofm_shift = 0 : si8 + }} { + %462 = tosa.clamp %arg6 { + LayerName = "Clip_209", + OutputName = "Clip_209", + max_fp = 6.000000e+00 : f32, + max_int = 6 : i64, + min_fp = 0.000000e+00 : f32, + min_int = 0 : i64} : (tensor<1x672x12x20xbf16>) -> tensor<1x672x12x20xbf16> loc(#loc142) + xten_nn.output %462 : tensor<1x672x12x20xbf16> loc(#loc142) + } -> tensor<1x672x12x20xbf16> loc(#loc142) + xten_nn.output %461 : tensor<1x672x12x20xbf16> loc(#loc142) + } -> tensor<1x672x12x20xbf16> loc(#loc142) + %285 = xten_nn.subgraph (%arg5 = %284: tensor<1x672x12x20xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Div_211", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Div_211", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x672x12x20xbf16>) attributes { + LayerName = "Div_211", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Div_211", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + Specializes = "MulAttributeBroadcastingBf16", + Traits = { + Elementwise = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.num_kernel_iters = 0 : ui16, + config.scalar = 1.660160e-01 : bf16, + config.scalar_position = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<1.660160e-01> : tensor<1x1x1x1xbf16>}> : () -> tensor<1x1x1x1xbf16> loc(#loc) + %463 = tosa.mul %arg6, %462 { + LayerName = "Div_211", + OutputName = "Div_211", + shift = 0 : i8} : (tensor<1x672x12x20xbf16>, tensor<1x1x1x1xbf16>) -> tensor<1x672x12x20xbf16> loc(#loc143) + xten_nn.output %463 : tensor<1x672x12x20xbf16> loc(#loc143) + } -> tensor<1x672x12x20xbf16> loc(#loc143) + xten_nn.output %461 : tensor<1x672x12x20xbf16> loc(#loc143) + } -> tensor<1x672x12x20xbf16> loc(#loc143) + %286 = xten_nn.subgraph (%arg5 = %282: tensor<1x672x12x20xbf16>, %arg6 = %285: tensor<1x672x12x20xbf16>) attributes { + IfmOperands = [0 : 
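// NOTE: the chain Add_206 (+3) -> Clip_209 (clamp to [0, 6]) -> Div_211 (x 1.660160e-01, the bf16
// rounding of 1/6 = 0.166015625) -> Mul_212 (by the pre-activation value) matches the standard
// HardSwish decomposition, hswish(x) = x * clamp(x + 3, 0, 6) / 6. The division is lowered to a
// broadcast multiply, which is why a layer named Div_211 Specializes = "MulAttributeBroadcastingBf16".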
index, 1 : index], + LayerName = "Mul_212", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Mul_212", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg7 = %arg5: tensor<1x672x12x20xbf16>, %arg8 = %arg6: tensor<1x672x12x20xbf16>) attributes { + LayerName = "Mul_212", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Mul_212", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + Specializes = "MulBf16", + Traits = { + Binary = true, + Elementwise = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.dtype = "bfloat16", + config.num_kernel_iters = 0 : ui16 + }} { + %462 = tosa.mul %arg7, %arg8 { + LayerName = "Mul_212", + OutputName = "Mul_212", + shift = 0 : i8} : (tensor<1x672x12x20xbf16>, tensor<1x672x12x20xbf16>) -> tensor<1x672x12x20xbf16> loc(#loc144) + xten_nn.output %462 : tensor<1x672x12x20xbf16> loc(#loc144) + } -> tensor<1x672x12x20xbf16> loc(#loc144) + xten_nn.output %461 : tensor<1x672x12x20xbf16> loc(#loc144) + } -> tensor<1x672x12x20xbf16> loc(#loc144) + %287 = xten_nn.subgraph (%arg5 = %286: tensor<1x672x12x20xbf16>, %arg6 = %75: tensor<672x1x3x3xbf16>, %arg7 = %74: tensor<672xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Conv_213", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "CMHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[672, 1, 3, 3]> : vector<4xindex> + }, + { + UnknownDataFormat = true + } + ], + OutputName = "Conv_213", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering 
= "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg8 = %arg5: tensor<1x672x12x20xbf16>, %arg9 = %arg6: tensor<672x1x3x3xbf16>, %arg10 = %arg7: tensor<672xbf16>) attributes { + Dilations = array, + HWPadding = [[1, 1], [1, 1]], + LayerName = "Conv_213", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "CMHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.wts", + SubPort = "wts_data", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[672, 1, 3, 3]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Conv_213", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + Specializes = "DepthwiseConv2dBf16", + With = { + config.act = 0 : ui8, + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.kernel_height = 3 : ui8, + config.kernel_width = 3 : ui8, + config.stride = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %463 = "tosa.const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %464 = "tosa.const"() <{value = dense<[2, 3, 0, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc145) + %465 = tosa.transpose %arg9, %464 : (tensor<672x1x3x3xbf16>, tensor<4xi32>) -> tensor<3x3x672x1xbf16> loc(#loc145) + %466 = tosa.transpose %arg8, %463 : (tensor<1x672x12x20xbf16>, tensor<4xi32>) -> tensor<1x12x20x672xbf16> loc(#loc145) + %467 = tosa.depthwise_conv2d %466, %465, %arg10 { + PartOfLayerName = "Conv_213", + PartOfOutputName = "Conv_213", + dilation = array, + pad = array, + stride = array} : (tensor<1x12x20x672xbf16>, tensor<3x3x672x1xbf16>, tensor<672xbf16>) -> tensor<1x12x20x672xbf16> loc(#loc145) + %468 = tosa.transpose %467, %462 : (tensor<1x12x20x672xbf16>, tensor<4xi32>) -> tensor<1x672x12x20xbf16> loc(#loc145) + xten_nn.output %468 : tensor<1x672x12x20xbf16> loc(#loc145) + } -> tensor<1x672x12x20xbf16> loc(#loc145) + xten_nn.output %461 : tensor<1x672x12x20xbf16> loc(#loc145) + } -> tensor<1x672x12x20xbf16> loc(#loc145) + %288 = xten_nn.subgraph (%arg5 = %287: tensor<1x672x12x20xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Add_215", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Add_215", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x672x12x20xbf16>) attributes { + LayerName = "Add_215", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + 
L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Add_215", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + Specializes = "AddAttributeBroadcastingBf16", + Traits = { + Elementwise = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.num_kernel_iters = 0 : ui16, + config.scalar = 3.000000e+00 : bf16, + config.scalar_position = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<3.000000e+00> : tensor<1x1x1x1xbf16>}> : () -> tensor<1x1x1x1xbf16> loc(#loc) + %463 = tosa.add %arg6, %462 {LayerName = "Add_215", OutputName = "Add_215"} : (tensor<1x672x12x20xbf16>, tensor<1x1x1x1xbf16>) -> tensor<1x672x12x20xbf16> loc(#loc146) + xten_nn.output %463 : tensor<1x672x12x20xbf16> loc(#loc146) + } -> tensor<1x672x12x20xbf16> loc(#loc146) + xten_nn.output %461 : tensor<1x672x12x20xbf16> loc(#loc146) + } -> tensor<1x672x12x20xbf16> loc(#loc146) + %289 = xten_nn.subgraph (%arg5 = %288: tensor<1x672x12x20xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Clip_218", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Clip_218", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x672x12x20xbf16>) attributes { + LayerName = "Clip_218", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Clip_218", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + Specializes = "ClipBf16", + Traits = { + Elementwise = true, + NonNegativeOut = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.clamp_max = 6.000000e+00 : bf16, + config.clamp_min = 0.000000e+00 : bf16, + config.compiler = "chess", + config.ifm_shift = 0 : si8, + config.num_kernel_iters = 0 : ui16, + config.ofm_shift = 0 : si8 + }} { + %462 = tosa.clamp %arg6 { + LayerName = "Clip_218", + OutputName = "Clip_218", + max_fp = 6.000000e+00 : f32, + max_int = 6 : i64, + min_fp = 0.000000e+00 : f32, + min_int = 0 : i64} : (tensor<1x672x12x20xbf16>) -> tensor<1x672x12x20xbf16> loc(#loc147) + xten_nn.output %462 : tensor<1x672x12x20xbf16> loc(#loc147) + } -> tensor<1x672x12x20xbf16> loc(#loc147) + xten_nn.output %461 : tensor<1x672x12x20xbf16> loc(#loc147) + } -> tensor<1x672x12x20xbf16> loc(#loc147) + %290 = xten_nn.subgraph 
(%arg5 = %289: tensor<1x672x12x20xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Div_220", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Div_220", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x672x12x20xbf16>) attributes { + LayerName = "Div_220", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Div_220", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + Specializes = "MulAttributeBroadcastingBf16", + Traits = { + Elementwise = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.num_kernel_iters = 0 : ui16, + config.scalar = 1.660160e-01 : bf16, + config.scalar_position = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<1.660160e-01> : tensor<1x1x1x1xbf16>}> : () -> tensor<1x1x1x1xbf16> loc(#loc) + %463 = tosa.mul %arg6, %462 { + LayerName = "Div_220", + OutputName = "Div_220", + shift = 0 : i8} : (tensor<1x672x12x20xbf16>, tensor<1x1x1x1xbf16>) -> tensor<1x672x12x20xbf16> loc(#loc148) + xten_nn.output %463 : tensor<1x672x12x20xbf16> loc(#loc148) + } -> tensor<1x672x12x20xbf16> loc(#loc148) + xten_nn.output %461 : tensor<1x672x12x20xbf16> loc(#loc148) + } -> tensor<1x672x12x20xbf16> loc(#loc148) + %291 = xten_nn.subgraph (%arg5 = %287: tensor<1x672x12x20xbf16>, %arg6 = %290: tensor<1x672x12x20xbf16>) attributes { + IfmOperands = [0 : index, 1 : index], + LayerName = "Mul_221", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Mul_221", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg7 = %arg5: tensor<1x672x12x20xbf16>, %arg8 = %arg6: tensor<1x672x12x20xbf16>) attributes { + LayerName = "Mul_221", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + 
Port = "data_io.ifm1", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Mul_221", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + Specializes = "MulBf16", + Traits = { + Binary = true, + Elementwise = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.dtype = "bfloat16", + config.num_kernel_iters = 0 : ui16 + }} { + %462 = tosa.mul %arg7, %arg8 { + LayerName = "Mul_221", + OutputName = "Mul_221", + shift = 0 : i8} : (tensor<1x672x12x20xbf16>, tensor<1x672x12x20xbf16>) -> tensor<1x672x12x20xbf16> loc(#loc149) + xten_nn.output %462 : tensor<1x672x12x20xbf16> loc(#loc149) + } -> tensor<1x672x12x20xbf16> loc(#loc149) + xten_nn.output %461 : tensor<1x672x12x20xbf16> loc(#loc149) + } -> tensor<1x672x12x20xbf16> loc(#loc149) + %292 = xten_nn.subgraph (%arg5 = %291: tensor<1x672x12x20xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Generated-#30", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Generated-#31", + Overlay = "1x1_1x1_unspecifiedConnectivity", + Reason = "TemplatedGraph", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 1, 240]> : vector<4xindex> + } + ], + Specializes = "Transpose4dAdf", + With = { + config.aie_arch = "aie2p", + config.dim_0 = 12 : ui32, + config.dim_1 = 84 : ui32, + config.dim_2 = 20 : ui32, + config.dim_3 = 8 : ui32, + config.dtype = "bfloat16", + config.perm = 6 : ui32 + }} { + %461 = tosa.reshape %arg5 {new_shape = array} : (tensor<1x672x12x20xbf16>) -> tensor<1x672x1x240xbf16> loc(#loc335) + xten_nn.output %461 : tensor<1x672x1x240xbf16> loc(#loc335) + } -> tensor<1x672x1x240xbf16> loc(#loc335) + %293 = xten_nn.subgraph (%arg5 = %292: tensor<1x672x1x240xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Generated-#32", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 1, 240]> : vector<4xindex> + } + ], + OutputName = "Generated-#33", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 1, 1]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x672x1x240xbf16>) attributes { + LayerName = "Generated-#32", + 
Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 1, 240]> : vector<4xindex> + } + ], + OutputName = "Generated-#33", + PadValue = 0.000000e+00 : bf16, + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 1, 1]> : vector<4xindex> + } + ], + Specializes = "ReduceMeanC8Bf16", + Traits = { + Reduce = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.full_channel = 672 : ui32, + config.full_height = 1 : ui32, + config.full_width = 240 : ui32, + config.reduce_dim = "W" + }} { + %462 = xten_nn.reduce_mean %arg6 {axes = array, keepdims = 1 : i64} : (tensor<1x672x1x240xbf16>) -> tensor<1x672x1x1xbf16> loc(#loc150) + xten_nn.output %462 : tensor<1x672x1x1xbf16> loc(#loc150) + } -> tensor<1x672x1x1xbf16> loc(#loc150) + xten_nn.output %461 : tensor<1x672x1x1xbf16> loc(#loc150) + } -> tensor<1x672x1x1xbf16> loc(#loc150) + %294 = xten_nn.subgraph (%arg5 = %293: tensor<1x672x1x1xbf16>, %arg6 = %73: tensor<168x672x1x1xbf16>, %arg7 = %72: tensor<168xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Conv_223", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 1, 1]> : vector<4xindex> + }, + { + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[168, 672, 1, 1]> : vector<4xindex> + }, + { + UnknownDataFormat = true + } + ], + OutputName = "Relu_224", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 168, 1, 1]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg8 = %arg5: tensor<1x672x1x1xbf16>, %arg9 = %arg6: tensor<168x672x1x1xbf16>, %arg10 = %arg7: tensor<168xbf16>) attributes { + Dilations = array, + HWPadding = [[0, 0], [0, 0]], + LayerName = "Conv_223", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 1, 1]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "wts_data", + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[168, 672, 1, 1]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Relu_224", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 168, 1, 1]> : vector<4xindex> + } + ], + Specializes = "Conv2DBf16", + Traits = { + AllowDMAOptimization = true, + NonNegativeOut = true + }, + With = { + config.AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 = 1 : ui8, + config.act = 1 : ui8, + config.act_type = "RELU", + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, 
+ config.compiler = "chess", + config.conv_type = [0 : ui8, 12 : ui8, 64 : ui8], + config.dtype_ifm = "bfloat16", + config.dtype_ofm = "bfloat16", + config.dtype_wts = "bfloat16", + config.ksize.height = 1 : ui8, + config.ksize.width = 1 : ui8, + config.lrelu_alpha = 0.000000e+00 : bf16, + config.lrelu_alpha_kernel = 0.000000e+00 : bf16, + config.stride_h = 1 : ui8, + config.stride_w = 1 : ui8 + }} { + %462 = tosa.reshape %arg9 {new_shape = array} : (tensor<168x672x1x1xbf16>) -> tensor<168x1x1x672xbf16> loc(#loc336) + %463 = tosa.reshape %arg8 {new_shape = array} : (tensor<1x672x1x1xbf16>) -> tensor<1x1x1x672xbf16> loc(#loc336) + %464 = tosa.conv2d %463, %462, %arg10 { + PartOfLayerName = "Conv_223", + PartOfOutputName = "Conv_223", + dilation = array, + pad = array, + stride = array} : (tensor<1x1x1x672xbf16>, tensor<168x1x1x672xbf16>, tensor<168xbf16>) -> tensor<1x1x1x168xbf16> loc(#loc151) + %465 = tosa.clamp %464 { + LayerName = "Relu_224", + OutputName = "Relu_224", + max_fp = 3.40282347E+38 : f32, + max_int = 2147483647 : i64, + min_fp = 0.000000e+00 : f32, + min_int = 0 : i64} : (tensor<1x1x1x168xbf16>) -> tensor<1x1x1x168xbf16> loc(#loc152) + %466 = tosa.reshape %465 {new_shape = array} : (tensor<1x1x1x168xbf16>) -> tensor<1x168x1x1xbf16> loc(#loc336) + xten_nn.output %466 : tensor<1x168x1x1xbf16> loc(#loc152) + } -> tensor<1x168x1x1xbf16> loc(#loc336) + xten_nn.output %461 : tensor<1x168x1x1xbf16> loc(#loc336) + } -> tensor<1x168x1x1xbf16> loc(#loc336) + %295 = xten_nn.subgraph (%arg5 = %294: tensor<1x168x1x1xbf16>, %arg6 = %71: tensor<672x168x1x1xbf16>, %arg7 = %70: tensor<672xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Conv_225", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 168, 1, 1]> : vector<4xindex> + }, + { + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[672, 168, 1, 1]> : vector<4xindex> + }, + { + UnknownDataFormat = true + } + ], + OutputName = "Conv_225", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 1, 1]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg8 = %arg5: tensor<1x168x1x1xbf16>, %arg9 = %arg6: tensor<672x168x1x1xbf16>, %arg10 = %arg7: tensor<672xbf16>) attributes { + Dilations = array, + HWPadding = [[0, 0], [0, 0]], + LayerName = "Conv_225", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 168, 1, 1]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "wts_data", + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[672, 168, 1, 1]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Conv_225", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 1, 1]> : 
vector<4xindex> + } + ], + Specializes = "Conv2DBf16", + Traits = { + AllowDMAOptimization = true + }, + With = { + config.AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 = 1 : ui8, + config.act = 0 : ui8, + config.act_type = "RELU", + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.conv_type = [0 : ui8, 12 : ui8, 64 : ui8], + config.dtype_ifm = "bfloat16", + config.dtype_ofm = "bfloat16", + config.dtype_wts = "bfloat16", + config.ksize.height = 1 : ui8, + config.ksize.width = 1 : ui8, + config.lrelu_alpha = 1.000000e+00 : bf16, + config.lrelu_alpha_kernel = 1.000000e+00 : bf16, + config.stride_h = 1 : ui8, + config.stride_w = 1 : ui8 + }} { + %462 = tosa.reshape %arg9 {new_shape = array} : (tensor<672x168x1x1xbf16>) -> tensor<672x1x1x168xbf16> loc(#loc153) + %463 = tosa.reshape %arg8 {new_shape = array} : (tensor<1x168x1x1xbf16>) -> tensor<1x1x1x168xbf16> loc(#loc153) + %464 = tosa.conv2d %463, %462, %arg10 { + PartOfLayerName = "Conv_225", + PartOfOutputName = "Conv_225", + dilation = array, + pad = array, + stride = array} : (tensor<1x1x1x168xbf16>, tensor<672x1x1x168xbf16>, tensor<672xbf16>) -> tensor<1x1x1x672xbf16> loc(#loc153) + %465 = tosa.reshape %464 {new_shape = array} : (tensor<1x1x1x672xbf16>) -> tensor<1x672x1x1xbf16> loc(#loc153) + xten_nn.output %465 : tensor<1x672x1x1xbf16> loc(#loc153) + } -> tensor<1x672x1x1xbf16> loc(#loc153) + xten_nn.output %461 : tensor<1x672x1x1xbf16> loc(#loc153) + } -> tensor<1x672x1x1xbf16> loc(#loc153) + %296 = xten_nn.subgraph (%arg5 = %295: tensor<1x672x1x1xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Add_227", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 1, 1]> : vector<4xindex> + } + ], + OutputName = "Add_227", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 1, 1]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x672x1x1xbf16>) attributes { + LayerName = "Add_227", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 1, 1]> : vector<4xindex> + } + ], + OutputName = "Add_227", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 1, 1]> : vector<4xindex> + } + ], + Specializes = "AddAttributeBroadcastingBf16", + Traits = { + Elementwise = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.num_kernel_iters = 0 : ui16, + config.scalar = 3.000000e+00 : bf16, + config.scalar_position = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<3.000000e+00> : tensor<1x1x1x1xbf16>}> : () -> tensor<1x1x1x1xbf16> loc(#loc) + %463 = tosa.add %arg6, %462 {LayerName = "Add_227", OutputName = "Add_227"} : (tensor<1x672x1x1xbf16>, tensor<1x1x1x1xbf16>) -> tensor<1x672x1x1xbf16> loc(#loc154) + xten_nn.output %463 : 
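// NOTE: subgraphs %292-%300 appear to form a squeeze-and-excitation block: the 12x20 spatial map
// is flattened to 1x240 (Transpose4dAdf) and mean-reduced over W (ReduceMeanC8Bf16) into a
// 1x672x1x1 descriptor, squeezed 672 -> 168 with a fused ReLU (Conv_223/Relu_224), expanded back
// 168 -> 672 (Conv_225), gated by the same Add/Clip/Mul hard-sigmoid pattern (Add_227, Clip_230,
// Div_232), broadcast back to 12x20 (TileAdf), and multiplied into the feature map (Mul_233).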
tensor<1x672x1x1xbf16> loc(#loc154) + } -> tensor<1x672x1x1xbf16> loc(#loc154) + xten_nn.output %461 : tensor<1x672x1x1xbf16> loc(#loc154) + } -> tensor<1x672x1x1xbf16> loc(#loc154) + %297 = xten_nn.subgraph (%arg5 = %296: tensor<1x672x1x1xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Clip_230", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 1, 1]> : vector<4xindex> + } + ], + OutputName = "Clip_230", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 1, 1]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x672x1x1xbf16>) attributes { + LayerName = "Clip_230", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 1, 1]> : vector<4xindex> + } + ], + OutputName = "Clip_230", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 1, 1]> : vector<4xindex> + } + ], + Specializes = "ClipBf16", + Traits = { + Elementwise = true, + NonNegativeOut = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.clamp_max = 6.000000e+00 : bf16, + config.clamp_min = 0.000000e+00 : bf16, + config.compiler = "chess", + config.ifm_shift = 0 : si8, + config.num_kernel_iters = 0 : ui16, + config.ofm_shift = 0 : si8 + }} { + %462 = tosa.clamp %arg6 { + LayerName = "Clip_230", + OutputName = "Clip_230", + max_fp = 6.000000e+00 : f32, + max_int = 6 : i64, + min_fp = 0.000000e+00 : f32, + min_int = 0 : i64} : (tensor<1x672x1x1xbf16>) -> tensor<1x672x1x1xbf16> loc(#loc155) + xten_nn.output %462 : tensor<1x672x1x1xbf16> loc(#loc155) + } -> tensor<1x672x1x1xbf16> loc(#loc155) + xten_nn.output %461 : tensor<1x672x1x1xbf16> loc(#loc155) + } -> tensor<1x672x1x1xbf16> loc(#loc155) + %298 = xten_nn.subgraph (%arg5 = %297: tensor<1x672x1x1xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Div_232", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 1, 1]> : vector<4xindex> + } + ], + OutputName = "Div_232", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 1, 1]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x672x1x1xbf16>) attributes { + LayerName = "Div_232", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 1, 1]> : 
vector<4xindex> + } + ], + OutputName = "Div_232", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 1, 1]> : vector<4xindex> + } + ], + Specializes = "MulAttributeBroadcastingBf16", + Traits = { + Elementwise = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.num_kernel_iters = 0 : ui16, + config.scalar = 1.660160e-01 : bf16, + config.scalar_position = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<1.660160e-01> : tensor<1x1x1x1xbf16>}> : () -> tensor<1x1x1x1xbf16> loc(#loc) + %463 = tosa.mul %arg6, %462 { + LayerName = "Div_232", + OutputName = "Div_232", + shift = 0 : i8} : (tensor<1x672x1x1xbf16>, tensor<1x1x1x1xbf16>) -> tensor<1x672x1x1xbf16> loc(#loc156) + xten_nn.output %463 : tensor<1x672x1x1xbf16> loc(#loc156) + } -> tensor<1x672x1x1xbf16> loc(#loc156) + xten_nn.output %461 : tensor<1x672x1x1xbf16> loc(#loc156) + } -> tensor<1x672x1x1xbf16> loc(#loc156) + %299 = xten_nn.subgraph (%arg5 = %298: tensor<1x672x1x1xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Generated-#34", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 1, 1]> : vector<4xindex> + } + ], + OutputName = "Generated-#35", + Overlay = "1x1_1x1_unspecifiedConnectivity", + Reason = "TemplatedGraph", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + Specializes = "TileAdf", + With = { + config.aie_arch = "aie2p", + config.dtype = "bfloat16", + config.i_dim_c = 672 : ui32, + config.i_dim_h = 1 : ui32, + config.i_dim_n = 1 : ui32, + config.i_dim_w = 1 : ui32, + config.rep_dim_c = 1 : ui32, + config.rep_dim_h = 12 : ui32, + config.rep_dim_w = 20 : ui32 + }} { + %461 = tosa.tile %arg5 {multiples = array<i64: 1, 1, 12, 20>} : (tensor<1x672x1x1xbf16>) -> tensor<1x672x12x20xbf16> loc(#loc157) + xten_nn.output %461 : tensor<1x672x12x20xbf16> loc(#loc157) + } -> tensor<1x672x12x20xbf16> loc(#loc157) + %300 = xten_nn.subgraph (%arg5 = %299: tensor<1x672x12x20xbf16>, %arg6 = %291: tensor<1x672x12x20xbf16>) attributes { + IfmOperands = [0 : index, 1 : index], + LayerName = "Mul_233", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Mul_233", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg7 = %arg5: tensor<1x672x12x20xbf16>, %arg8 = %arg6: tensor<1x672x12x20xbf16>) attributes { + LayerName = "Mul_233", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Mul_233", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + Specializes = "MulBf16", + Traits = { + Binary = true, + Elementwise = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.dtype = "bfloat16", + config.num_kernel_iters = 0 : ui16 + }} { + %462 = tosa.mul %arg7, %arg8 { + LayerName = "Mul_233", + OutputName = "Mul_233", + shift = 0 : i8} : (tensor<1x672x12x20xbf16>, tensor<1x672x12x20xbf16>) -> tensor<1x672x12x20xbf16> loc(#loc157) + xten_nn.output %462 : tensor<1x672x12x20xbf16> loc(#loc157) + } -> tensor<1x672x12x20xbf16> loc(#loc157) + xten_nn.output %461 : tensor<1x672x12x20xbf16> loc(#loc157) + } -> tensor<1x672x12x20xbf16> loc(#loc157) + %301 = xten_nn.subgraph (%arg5 = %300: tensor<1x672x12x20xbf16>, %arg6 = %69: tensor<112x672x1x1xbf16>, %arg7 = %68: tensor<112xbf16>, %arg8 = %281: tensor<1x112x12x20xbf16>) attributes { + IfmOperands = [0 : index, 3 : index], + LayerName = "Conv_234", + OfmShare = 3 : index, + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + }, + { + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[112, 672, 1, 1]> : vector<4xindex> + }, + { + UnknownDataFormat = true + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 112, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Add_235", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 112, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg9 = %arg5: tensor<1x672x12x20xbf16>, %arg10 = %arg6: tensor<112x672x1x1xbf16>, %arg11 = %arg7: tensor<112xbf16>) attributes { + Dilations = array<i64: 1, 1>, + HWPadding = [[0, 0], [0, 0]], + LayerName = "Conv_234", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "wts_data", + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[112, 672, 1, 1]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Conv_234", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 112, 12, 20]> : vector<4xindex> + } + ], + Specializes = "Conv2DBf16", + Traits = { + AllowDMAOptimization = true + }, + With = { + config.AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 = 1 : ui8, + config.act = 0 : ui8, + config.act_type = "RELU", + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.conv_type = [0 : ui8, 12 : ui8, 64 : ui8], + config.dtype_ifm = "bfloat16", + config.dtype_ofm = "bfloat16", + config.dtype_wts = "bfloat16", + config.ksize.height = 1 : ui8, + config.ksize.width = 1 : ui8, + config.lrelu_alpha = 1.000000e+00 : bf16, + config.lrelu_alpha_kernel = 1.000000e+00 : bf16, + config.stride_h = 1 : ui8, + config.stride_w = 1 : ui8 + }} { + %463 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %464 = "tosa.const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc158) + %465 = tosa.reshape %arg10 {new_shape = array<i64: 112, 1, 1, 672>} : (tensor<112x672x1x1xbf16>) -> tensor<112x1x1x672xbf16> loc(#loc158) + %466 = tosa.transpose %arg9, %464 : (tensor<1x672x12x20xbf16>, tensor<4xi32>) -> tensor<1x12x20x672xbf16> loc(#loc158) + %467 = tosa.conv2d %466, %465, %arg11 { + PartOfLayerName = "Conv_234", + PartOfOutputName = "Conv_234", + dilation = array<i64: 1, 1>, + pad = array<i64: 0, 0, 0, 0>, + stride = array<i64: 1, 1>} : (tensor<1x12x20x672xbf16>, tensor<112x1x1x672xbf16>, tensor<112xbf16>) -> tensor<1x12x20x112xbf16> loc(#loc158) + %468 = tosa.transpose %467, %463 : (tensor<1x12x20x112xbf16>, tensor<4xi32>) -> tensor<1x112x12x20xbf16> loc(#loc158) + xten_nn.output %468 : tensor<1x112x12x20xbf16> loc(#loc158) + } -> tensor<1x112x12x20xbf16> loc(#loc158) + %462 = xten_nn.subgraph (%arg9 = %461: tensor<1x112x12x20xbf16>, %arg10 = %arg8: tensor<1x112x12x20xbf16>) attributes { + LayerName = "Add_235", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 112, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 112, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Add_235", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 112, 12, 20]> : vector<4xindex> + } + ], + Specializes = "AddBf16", + Traits = { + Binary = true, + Elementwise = true + }, + With = { + config.act = 0 : ui8, + config.act_type = "LINEAR", + config.aie_arch = "aie2p", + config.compiler = "chess", + config.dtype = "bfloat16", + config.num_kernel_iters = 0 : ui16 + }} { + %463 = tosa.add %arg9, %arg10 {LayerName = "Add_235", OutputName = "Add_235"} : (tensor<1x112x12x20xbf16>, tensor<1x112x12x20xbf16>) -> tensor<1x112x12x20xbf16> loc(#loc159) + xten_nn.output %463 : tensor<1x112x12x20xbf16> loc(#loc159) + } -> tensor<1x112x12x20xbf16> loc(#loc159) + xten_nn.output %462 : tensor<1x112x12x20xbf16> loc(#loc159) + } -> tensor<1x112x12x20xbf16> loc(#loc337) + %302 = xten_nn.subgraph (%arg5 = %301: tensor<1x112x12x20xbf16>, %arg6 = %67: tensor<672x112x1x1xbf16>, %arg7 = %66: tensor<672xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Conv_236", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 112, 12, 20]> : vector<4xindex> + }, + { + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[672, 112, 1, 1]> : vector<4xindex> + }, + { + UnknownDataFormat = true + } + ], + OutputName = "Conv_236", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg8 = %arg5: tensor<1x112x12x20xbf16>, %arg9 = %arg6: tensor<672x112x1x1xbf16>, %arg10 = %arg7: tensor<672xbf16>) attributes { + Dilations = array<i64: 1, 1>, + HWPadding = [[0, 0], [0, 0]], + LayerName = "Conv_236", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 112, 12, 20]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "wts_data", + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[672, 112, 1, 1]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Conv_236", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + Specializes = "Conv2DBf16", + Traits = { + AllowDMAOptimization = true + }, + With = { + config.AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 = 1 : ui8, + config.act = 0 : ui8, + config.act_type = "RELU", + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.conv_type = [0 : ui8, 12 : ui8, 64 : ui8], + config.dtype_ifm = "bfloat16", + config.dtype_ofm = "bfloat16", + config.dtype_wts = "bfloat16", + config.ksize.height = 1 : ui8, + config.ksize.width = 1 : ui8, + config.lrelu_alpha = 1.000000e+00 : bf16, + config.lrelu_alpha_kernel = 1.000000e+00 : bf16, + config.stride_h = 1 : ui8, + config.stride_w = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %463 = "tosa.const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc160) + %464 = tosa.reshape %arg9 {new_shape = array<i64: 672, 1, 1, 112>} : (tensor<672x112x1x1xbf16>) -> tensor<672x1x1x112xbf16> loc(#loc160) + %465 = tosa.transpose %arg8, %463 : (tensor<1x112x12x20xbf16>, tensor<4xi32>) -> tensor<1x12x20x112xbf16> loc(#loc160) + %466 = tosa.conv2d %465, %464, %arg10 { + PartOfLayerName = "Conv_236", + PartOfOutputName = "Conv_236", + dilation = array<i64: 1, 1>, + pad = array<i64: 0, 0, 0, 0>, + stride = array<i64: 1, 1>} : (tensor<1x12x20x112xbf16>, tensor<672x1x1x112xbf16>, tensor<672xbf16>) -> tensor<1x12x20x672xbf16> loc(#loc160) + %467 =
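// NOTE: Conv_234 (672 -> 112, 1x1) with the fused AddBf16 (Add_235) closes the inverted-residual
// block: the add's second input is %281, the block's original 1x112x12x20 input. OfmShare = 3 is
// read here as "the OFM buffer is shared with operand 3 (the residual input)"; that interpretation
// is an assumption, as the attribute is not documented in this file. Conv_236 then starts the next
// block's 112 -> 672 expansion, feeding the same Add/Clip/Mul hard-swish pattern.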
tosa.transpose %466, %462 : (tensor<1x12x20x672xbf16>, tensor<4xi32>) -> tensor<1x672x12x20xbf16> loc(#loc160) + xten_nn.output %467 : tensor<1x672x12x20xbf16> loc(#loc160) + } -> tensor<1x672x12x20xbf16> loc(#loc160) + xten_nn.output %461 : tensor<1x672x12x20xbf16> loc(#loc160) + } -> tensor<1x672x12x20xbf16> loc(#loc160) + %303 = xten_nn.subgraph (%arg5 = %302: tensor<1x672x12x20xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Add_238", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Add_238", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x672x12x20xbf16>) attributes { + LayerName = "Add_238", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Add_238", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + Specializes = "AddAttributeBroadcastingBf16", + Traits = { + Elementwise = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.num_kernel_iters = 0 : ui16, + config.scalar = 3.000000e+00 : bf16, + config.scalar_position = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<3.000000e+00> : tensor<1x1x1x1xbf16>}> : () -> tensor<1x1x1x1xbf16> loc(#loc) + %463 = tosa.add %arg6, %462 {LayerName = "Add_238", OutputName = "Add_238"} : (tensor<1x672x12x20xbf16>, tensor<1x1x1x1xbf16>) -> tensor<1x672x12x20xbf16> loc(#loc161) + xten_nn.output %463 : tensor<1x672x12x20xbf16> loc(#loc161) + } -> tensor<1x672x12x20xbf16> loc(#loc161) + xten_nn.output %461 : tensor<1x672x12x20xbf16> loc(#loc161) + } -> tensor<1x672x12x20xbf16> loc(#loc161) + %304 = xten_nn.subgraph (%arg5 = %303: tensor<1x672x12x20xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Clip_241", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Clip_241", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x672x12x20xbf16>) attributes { + LayerName = "Clip_241", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + 
L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Clip_241", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + Specializes = "ClipBf16", + Traits = { + Elementwise = true, + NonNegativeOut = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.clamp_max = 6.000000e+00 : bf16, + config.clamp_min = 0.000000e+00 : bf16, + config.compiler = "chess", + config.ifm_shift = 0 : si8, + config.num_kernel_iters = 0 : ui16, + config.ofm_shift = 0 : si8 + }} { + %462 = tosa.clamp %arg6 { + LayerName = "Clip_241", + OutputName = "Clip_241", + max_fp = 6.000000e+00 : f32, + max_int = 6 : i64, + min_fp = 0.000000e+00 : f32, + min_int = 0 : i64} : (tensor<1x672x12x20xbf16>) -> tensor<1x672x12x20xbf16> loc(#loc162) + xten_nn.output %462 : tensor<1x672x12x20xbf16> loc(#loc162) + } -> tensor<1x672x12x20xbf16> loc(#loc162) + xten_nn.output %461 : tensor<1x672x12x20xbf16> loc(#loc162) + } -> tensor<1x672x12x20xbf16> loc(#loc162) + %305 = xten_nn.subgraph (%arg5 = %304: tensor<1x672x12x20xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Div_243", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Div_243", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x672x12x20xbf16>) attributes { + LayerName = "Div_243", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Div_243", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + Specializes = "MulAttributeBroadcastingBf16", + Traits = { + Elementwise = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.num_kernel_iters = 0 : ui16, + config.scalar = 1.660160e-01 : bf16, + config.scalar_position = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<1.660160e-01> : tensor<1x1x1x1xbf16>}> : () -> tensor<1x1x1x1xbf16> loc(#loc) + %463 = tosa.mul %arg6, %462 { + LayerName = "Div_243", + OutputName = "Div_243", + shift = 0 : i8} : (tensor<1x672x12x20xbf16>, tensor<1x1x1x1xbf16>) -> tensor<1x672x12x20xbf16> loc(#loc163) + xten_nn.output %463 : tensor<1x672x12x20xbf16> loc(#loc163) + } -> tensor<1x672x12x20xbf16> loc(#loc163) + xten_nn.output %461 : tensor<1x672x12x20xbf16> loc(#loc163) + } -> tensor<1x672x12x20xbf16> loc(#loc163) + %306 
= xten_nn.subgraph (%arg5 = %302: tensor<1x672x12x20xbf16>, %arg6 = %305: tensor<1x672x12x20xbf16>) attributes { + IfmOperands = [0 : index, 1 : index], + LayerName = "Mul_244", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Mul_244", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg7 = %arg5: tensor<1x672x12x20xbf16>, %arg8 = %arg6: tensor<1x672x12x20xbf16>) attributes { + LayerName = "Mul_244", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Mul_244", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + Specializes = "MulBf16", + Traits = { + Binary = true, + Elementwise = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.dtype = "bfloat16", + config.num_kernel_iters = 0 : ui16 + }} { + %462 = tosa.mul %arg7, %arg8 { + LayerName = "Mul_244", + OutputName = "Mul_244", + shift = 0 : i8} : (tensor<1x672x12x20xbf16>, tensor<1x672x12x20xbf16>) -> tensor<1x672x12x20xbf16> loc(#loc164) + xten_nn.output %462 : tensor<1x672x12x20xbf16> loc(#loc164) + } -> tensor<1x672x12x20xbf16> loc(#loc164) + xten_nn.output %461 : tensor<1x672x12x20xbf16> loc(#loc164) + } -> tensor<1x672x12x20xbf16> loc(#loc164) + %307 = xten_nn.subgraph (%arg5 = %306: tensor<1x672x12x20xbf16>, %arg6 = %65: tensor<672x1x9x9xbf16>, %arg7 = %64: tensor<672xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Conv_245", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "CMHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[672, 1, 9, 9]> : vector<4xindex> + }, + { + UnknownDataFormat = true + } + ], + OutputName = "Conv_245", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = 
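+ // Conv_245: 9x9 depthwise convolution (one filter per channel, 672 channels), stride 1, padding 4 on every side (HWPadding [[4, 4], [4, 4]]), so the 12x20 spatial shape is preserved.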
dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg8 = %arg5: tensor<1x672x12x20xbf16>, %arg9 = %arg6: tensor<672x1x9x9xbf16>, %arg10 = %arg7: tensor<672xbf16>) attributes { + Dilations = array<i64: 1, 1>, + HWPadding = [[4, 4], [4, 4]], + LayerName = "Conv_245", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "CMHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.wts", + SubPort = "wts_data", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[672, 1, 9, 9]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Conv_245", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + Specializes = "DepthwiseConv2dBf16", + With = { + config.act = 0 : ui8, + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.kernel_height = 9 : ui8, + config.kernel_width = 9 : ui8, + config.stride = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %463 = "tosa.const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %464 = "tosa.const"() <{value = dense<[2, 3, 0, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc165) + %465 = tosa.transpose %arg9, %464 : (tensor<672x1x9x9xbf16>, tensor<4xi32>) -> tensor<9x9x672x1xbf16> loc(#loc165) + %466 = tosa.transpose %arg8, %463 : (tensor<1x672x12x20xbf16>, tensor<4xi32>) -> tensor<1x12x20x672xbf16> loc(#loc165) + %467 = tosa.depthwise_conv2d %466, %465, %arg10 { + PartOfLayerName = "Conv_245", + PartOfOutputName = "Conv_245", + dilation = array<i64: 1, 1>, + pad = array<i64: 4, 4, 4, 4>, + stride = array<i64: 1, 1>} : (tensor<1x12x20x672xbf16>, tensor<9x9x672x1xbf16>, tensor<672xbf16>) -> tensor<1x12x20x672xbf16> loc(#loc165) + %468 = tosa.transpose %467, %462 : (tensor<1x12x20x672xbf16>, tensor<4xi32>) -> tensor<1x672x12x20xbf16> loc(#loc165) + xten_nn.output %468 : tensor<1x672x12x20xbf16> loc(#loc165) + } -> tensor<1x672x12x20xbf16> loc(#loc165) + xten_nn.output %461 : tensor<1x672x12x20xbf16> loc(#loc165) + } -> tensor<1x672x12x20xbf16> loc(#loc165) + %308 = xten_nn.subgraph (%arg5 = %307: tensor<1x672x12x20xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Add_247", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Add_247", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5:
tensor<1x672x12x20xbf16>) attributes { + LayerName = "Add_247", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Add_247", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + Specializes = "AddAttributeBroadcastingBf16", + Traits = { + Elementwise = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.num_kernel_iters = 0 : ui16, + config.scalar = 3.000000e+00 : bf16, + config.scalar_position = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<3.000000e+00> : tensor<1x1x1x1xbf16>}> : () -> tensor<1x1x1x1xbf16> loc(#loc) + %463 = tosa.add %arg6, %462 {LayerName = "Add_247", OutputName = "Add_247"} : (tensor<1x672x12x20xbf16>, tensor<1x1x1x1xbf16>) -> tensor<1x672x12x20xbf16> loc(#loc166) + xten_nn.output %463 : tensor<1x672x12x20xbf16> loc(#loc166) + } -> tensor<1x672x12x20xbf16> loc(#loc166) + xten_nn.output %461 : tensor<1x672x12x20xbf16> loc(#loc166) + } -> tensor<1x672x12x20xbf16> loc(#loc166) + %309 = xten_nn.subgraph (%arg5 = %308: tensor<1x672x12x20xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Clip_250", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Clip_250", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x672x12x20xbf16>) attributes { + LayerName = "Clip_250", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Clip_250", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + Specializes = "ClipBf16", + Traits = { + Elementwise = true, + NonNegativeOut = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.clamp_max = 6.000000e+00 : bf16, + config.clamp_min = 0.000000e+00 : bf16, + config.compiler = "chess", + config.ifm_shift = 0 : si8, + config.num_kernel_iters = 0 : ui16, + config.ofm_shift = 0 : si8 + }} { + %462 = tosa.clamp %arg6 { + LayerName = "Clip_250", + OutputName = "Clip_250", + max_fp = 6.000000e+00 : f32, + max_int = 6 : i64, + min_fp = 0.000000e+00 : f32, + min_int = 0 : i64} : (tensor<1x672x12x20xbf16>) -> tensor<1x672x12x20xbf16> loc(#loc167) + xten_nn.output %462 : tensor<1x672x12x20xbf16> loc(#loc167) + } -> tensor<1x672x12x20xbf16> 
loc(#loc167) + xten_nn.output %461 : tensor<1x672x12x20xbf16> loc(#loc167) + } -> tensor<1x672x12x20xbf16> loc(#loc167) + %310 = xten_nn.subgraph (%arg5 = %309: tensor<1x672x12x20xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Div_252", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Div_252", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x672x12x20xbf16>) attributes { + LayerName = "Div_252", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Div_252", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + Specializes = "MulAttributeBroadcastingBf16", + Traits = { + Elementwise = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.num_kernel_iters = 0 : ui16, + config.scalar = 1.660160e-01 : bf16, + config.scalar_position = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<1.660160e-01> : tensor<1x1x1x1xbf16>}> : () -> tensor<1x1x1x1xbf16> loc(#loc) + %463 = tosa.mul %arg6, %462 { + LayerName = "Div_252", + OutputName = "Div_252", + shift = 0 : i8} : (tensor<1x672x12x20xbf16>, tensor<1x1x1x1xbf16>) -> tensor<1x672x12x20xbf16> loc(#loc168) + xten_nn.output %463 : tensor<1x672x12x20xbf16> loc(#loc168) + } -> tensor<1x672x12x20xbf16> loc(#loc168) + xten_nn.output %461 : tensor<1x672x12x20xbf16> loc(#loc168) + } -> tensor<1x672x12x20xbf16> loc(#loc168) + %311 = xten_nn.subgraph (%arg5 = %307: tensor<1x672x12x20xbf16>, %arg6 = %310: tensor<1x672x12x20xbf16>) attributes { + IfmOperands = [0 : index, 1 : index], + LayerName = "Mul_253", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Mul_253", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg7 = %arg5: tensor<1x672x12x20xbf16>, %arg8 = %arg6: tensor<1x672x12x20xbf16>) attributes { + 
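+ // Add_247, Clip_250, Div_252 and Mul_253 repeat the same hard-swish pattern on the output of the depthwise Conv_245.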
LayerName = "Mul_253", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Mul_253", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + Specializes = "MulBf16", + Traits = { + Binary = true, + Elementwise = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.dtype = "bfloat16", + config.num_kernel_iters = 0 : ui16 + }} { + %462 = tosa.mul %arg7, %arg8 { + LayerName = "Mul_253", + OutputName = "Mul_253", + shift = 0 : i8} : (tensor<1x672x12x20xbf16>, tensor<1x672x12x20xbf16>) -> tensor<1x672x12x20xbf16> loc(#loc169) + xten_nn.output %462 : tensor<1x672x12x20xbf16> loc(#loc169) + } -> tensor<1x672x12x20xbf16> loc(#loc169) + xten_nn.output %461 : tensor<1x672x12x20xbf16> loc(#loc169) + } -> tensor<1x672x12x20xbf16> loc(#loc169) + %312 = xten_nn.subgraph (%arg5 = %311: tensor<1x672x12x20xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Generated-#36", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Generated-#37", + Overlay = "1x1_1x1_unspecifiedConnectivity", + Reason = "TemplatedGraph", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 1, 240]> : vector<4xindex> + } + ], + Specializes = "Transpose4dAdf", + With = { + config.aie_arch = "aie2p", + config.dim_0 = 12 : ui32, + config.dim_1 = 84 : ui32, + config.dim_2 = 20 : ui32, + config.dim_3 = 8 : ui32, + config.dtype = "bfloat16", + config.perm = 6 : ui32 + }} { + %461 = tosa.reshape %arg5 {new_shape = array} : (tensor<1x672x12x20xbf16>) -> tensor<1x672x1x240xbf16> loc(#loc338) + xten_nn.output %461 : tensor<1x672x1x240xbf16> loc(#loc338) + } -> tensor<1x672x1x240xbf16> loc(#loc338) + %313 = xten_nn.subgraph (%arg5 = %312: tensor<1x672x1x240xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Generated-#38", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 1, 240]> : vector<4xindex> + } + ], + OutputName = "Generated-#39", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 1, 1]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", 
layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x672x1x240xbf16>) attributes { + LayerName = "Generated-#38", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 1, 240]> : vector<4xindex> + } + ], + OutputName = "Generated-#39", + PadValue = 0.000000e+00 : bf16, + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 1, 1]> : vector<4xindex> + } + ], + Specializes = "ReduceMeanC8Bf16", + Traits = { + Reduce = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.full_channel = 672 : ui32, + config.full_height = 1 : ui32, + config.full_width = 240 : ui32, + config.reduce_dim = "W" + }} { + %462 = xten_nn.reduce_mean %arg6 {axes = array, keepdims = 1 : i64} : (tensor<1x672x1x240xbf16>) -> tensor<1x672x1x1xbf16> loc(#loc170) + xten_nn.output %462 : tensor<1x672x1x1xbf16> loc(#loc170) + } -> tensor<1x672x1x1xbf16> loc(#loc170) + xten_nn.output %461 : tensor<1x672x1x1xbf16> loc(#loc170) + } -> tensor<1x672x1x1xbf16> loc(#loc170) + %314 = xten_nn.subgraph (%arg5 = %313: tensor<1x672x1x1xbf16>, %arg6 = %63: tensor<168x672x1x1xbf16>, %arg7 = %62: tensor<168xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Conv_255", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 1, 1]> : vector<4xindex> + }, + { + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[168, 672, 1, 1]> : vector<4xindex> + }, + { + UnknownDataFormat = true + } + ], + OutputName = "Relu_256", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 168, 1, 1]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg8 = %arg5: tensor<1x672x1x1xbf16>, %arg9 = %arg6: tensor<168x672x1x1xbf16>, %arg10 = %arg7: tensor<168xbf16>) attributes { + Dilations = array, + HWPadding = [[0, 0], [0, 0]], + LayerName = "Conv_255", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 1, 1]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "wts_data", + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[168, 672, 1, 1]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Relu_256", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 168, 1, 1]> : vector<4xindex> + } + ], + Specializes = "Conv2DBf16", + Traits = { + AllowDMAOptimization = true, + NonNegativeOut = true + }, + With = { + 
config.AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 = 1 : ui8, + config.act = 1 : ui8, + config.act_type = "RELU", + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.conv_type = [0 : ui8, 12 : ui8, 64 : ui8], + config.dtype_ifm = "bfloat16", + config.dtype_ofm = "bfloat16", + config.dtype_wts = "bfloat16", + config.ksize.height = 1 : ui8, + config.ksize.width = 1 : ui8, + config.lrelu_alpha = 0.000000e+00 : bf16, + config.lrelu_alpha_kernel = 0.000000e+00 : bf16, + config.stride_h = 1 : ui8, + config.stride_w = 1 : ui8 + }} { + %462 = tosa.reshape %arg9 {new_shape = array} : (tensor<168x672x1x1xbf16>) -> tensor<168x1x1x672xbf16> loc(#loc339) + %463 = tosa.reshape %arg8 {new_shape = array} : (tensor<1x672x1x1xbf16>) -> tensor<1x1x1x672xbf16> loc(#loc339) + %464 = tosa.conv2d %463, %462, %arg10 { + PartOfLayerName = "Conv_255", + PartOfOutputName = "Conv_255", + dilation = array, + pad = array, + stride = array} : (tensor<1x1x1x672xbf16>, tensor<168x1x1x672xbf16>, tensor<168xbf16>) -> tensor<1x1x1x168xbf16> loc(#loc171) + %465 = tosa.clamp %464 { + LayerName = "Relu_256", + OutputName = "Relu_256", + max_fp = 3.40282347E+38 : f32, + max_int = 2147483647 : i64, + min_fp = 0.000000e+00 : f32, + min_int = 0 : i64} : (tensor<1x1x1x168xbf16>) -> tensor<1x1x1x168xbf16> loc(#loc172) + %466 = tosa.reshape %465 {new_shape = array} : (tensor<1x1x1x168xbf16>) -> tensor<1x168x1x1xbf16> loc(#loc339) + xten_nn.output %466 : tensor<1x168x1x1xbf16> loc(#loc172) + } -> tensor<1x168x1x1xbf16> loc(#loc339) + xten_nn.output %461 : tensor<1x168x1x1xbf16> loc(#loc339) + } -> tensor<1x168x1x1xbf16> loc(#loc339) + %315 = xten_nn.subgraph (%arg5 = %314: tensor<1x168x1x1xbf16>, %arg6 = %61: tensor<672x168x1x1xbf16>, %arg7 = %60: tensor<672xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Conv_257", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 168, 1, 1]> : vector<4xindex> + }, + { + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[672, 168, 1, 1]> : vector<4xindex> + }, + { + UnknownDataFormat = true + } + ], + OutputName = "Conv_257", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 1, 1]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg8 = %arg5: tensor<1x168x1x1xbf16>, %arg9 = %arg6: tensor<672x168x1x1xbf16>, %arg10 = %arg7: tensor<672xbf16>) attributes { + Dilations = array, + HWPadding = [[0, 0], [0, 0]], + LayerName = "Conv_257", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 168, 1, 1]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "wts_data", + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[672, 168, 1, 1]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Conv_257", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + 
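+ // Conv_257: matching "excite" 1x1 convolution back up from 168 to 672 channels, with no activation (config.act = 0).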
L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 1, 1]> : vector<4xindex> + } + ], + Specializes = "Conv2DBf16", + Traits = { + AllowDMAOptimization = true + }, + With = { + config.AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 = 1 : ui8, + config.act = 0 : ui8, + config.act_type = "RELU", + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.conv_type = [0 : ui8, 12 : ui8, 64 : ui8], + config.dtype_ifm = "bfloat16", + config.dtype_ofm = "bfloat16", + config.dtype_wts = "bfloat16", + config.ksize.height = 1 : ui8, + config.ksize.width = 1 : ui8, + config.lrelu_alpha = 1.000000e+00 : bf16, + config.lrelu_alpha_kernel = 1.000000e+00 : bf16, + config.stride_h = 1 : ui8, + config.stride_w = 1 : ui8 + }} { + %462 = tosa.reshape %arg9 {new_shape = array} : (tensor<672x168x1x1xbf16>) -> tensor<672x1x1x168xbf16> loc(#loc173) + %463 = tosa.reshape %arg8 {new_shape = array} : (tensor<1x168x1x1xbf16>) -> tensor<1x1x1x168xbf16> loc(#loc173) + %464 = tosa.conv2d %463, %462, %arg10 { + PartOfLayerName = "Conv_257", + PartOfOutputName = "Conv_257", + dilation = array, + pad = array, + stride = array} : (tensor<1x1x1x168xbf16>, tensor<672x1x1x168xbf16>, tensor<672xbf16>) -> tensor<1x1x1x672xbf16> loc(#loc173) + %465 = tosa.reshape %464 {new_shape = array} : (tensor<1x1x1x672xbf16>) -> tensor<1x672x1x1xbf16> loc(#loc173) + xten_nn.output %465 : tensor<1x672x1x1xbf16> loc(#loc173) + } -> tensor<1x672x1x1xbf16> loc(#loc173) + xten_nn.output %461 : tensor<1x672x1x1xbf16> loc(#loc173) + } -> tensor<1x672x1x1xbf16> loc(#loc173) + %316 = xten_nn.subgraph (%arg5 = %315: tensor<1x672x1x1xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Add_259", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 1, 1]> : vector<4xindex> + } + ], + OutputName = "Add_259", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 1, 1]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x672x1x1xbf16>) attributes { + LayerName = "Add_259", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 1, 1]> : vector<4xindex> + } + ], + OutputName = "Add_259", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 1, 1]> : vector<4xindex> + } + ], + Specializes = "AddAttributeBroadcastingBf16", + Traits = { + Elementwise = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.num_kernel_iters = 0 : ui16, + config.scalar = 3.000000e+00 : bf16, + config.scalar_position = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<3.000000e+00> : tensor<1x1x1x1xbf16>}> : () -> tensor<1x1x1x1xbf16> loc(#loc) + %463 = tosa.add %arg6, %462 {LayerName = 
"Add_259", OutputName = "Add_259"} : (tensor<1x672x1x1xbf16>, tensor<1x1x1x1xbf16>) -> tensor<1x672x1x1xbf16> loc(#loc174) + xten_nn.output %463 : tensor<1x672x1x1xbf16> loc(#loc174) + } -> tensor<1x672x1x1xbf16> loc(#loc174) + xten_nn.output %461 : tensor<1x672x1x1xbf16> loc(#loc174) + } -> tensor<1x672x1x1xbf16> loc(#loc174) + %317 = xten_nn.subgraph (%arg5 = %316: tensor<1x672x1x1xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Clip_262", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 1, 1]> : vector<4xindex> + } + ], + OutputName = "Clip_262", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 1, 1]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x672x1x1xbf16>) attributes { + LayerName = "Clip_262", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 1, 1]> : vector<4xindex> + } + ], + OutputName = "Clip_262", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 1, 1]> : vector<4xindex> + } + ], + Specializes = "ClipBf16", + Traits = { + Elementwise = true, + NonNegativeOut = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.clamp_max = 6.000000e+00 : bf16, + config.clamp_min = 0.000000e+00 : bf16, + config.compiler = "chess", + config.ifm_shift = 0 : si8, + config.num_kernel_iters = 0 : ui16, + config.ofm_shift = 0 : si8 + }} { + %462 = tosa.clamp %arg6 { + LayerName = "Clip_262", + OutputName = "Clip_262", + max_fp = 6.000000e+00 : f32, + max_int = 6 : i64, + min_fp = 0.000000e+00 : f32, + min_int = 0 : i64} : (tensor<1x672x1x1xbf16>) -> tensor<1x672x1x1xbf16> loc(#loc175) + xten_nn.output %462 : tensor<1x672x1x1xbf16> loc(#loc175) + } -> tensor<1x672x1x1xbf16> loc(#loc175) + xten_nn.output %461 : tensor<1x672x1x1xbf16> loc(#loc175) + } -> tensor<1x672x1x1xbf16> loc(#loc175) + %318 = xten_nn.subgraph (%arg5 = %317: tensor<1x672x1x1xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Div_264", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 1, 1]> : vector<4xindex> + } + ], + OutputName = "Div_264", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 1, 1]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x672x1x1xbf16>) attributes { + LayerName = "Div_264", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = 
"HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 1, 1]> : vector<4xindex> + } + ], + OutputName = "Div_264", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 1, 1]> : vector<4xindex> + } + ], + Specializes = "MulAttributeBroadcastingBf16", + Traits = { + Elementwise = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.num_kernel_iters = 0 : ui16, + config.scalar = 1.660160e-01 : bf16, + config.scalar_position = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<1.660160e-01> : tensor<1x1x1x1xbf16>}> : () -> tensor<1x1x1x1xbf16> loc(#loc) + %463 = tosa.mul %arg6, %462 { + LayerName = "Div_264", + OutputName = "Div_264", + shift = 0 : i8} : (tensor<1x672x1x1xbf16>, tensor<1x1x1x1xbf16>) -> tensor<1x672x1x1xbf16> loc(#loc176) + xten_nn.output %463 : tensor<1x672x1x1xbf16> loc(#loc176) + } -> tensor<1x672x1x1xbf16> loc(#loc176) + xten_nn.output %461 : tensor<1x672x1x1xbf16> loc(#loc176) + } -> tensor<1x672x1x1xbf16> loc(#loc176) + %319 = xten_nn.subgraph (%arg5 = %318: tensor<1x672x1x1xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Generated-#40", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 1, 1]> : vector<4xindex> + } + ], + OutputName = "Generated-#41", + Overlay = "1x1_1x1_unspecifiedConnectivity", + Reason = "TemplatedGraph", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + Specializes = "TileAdf", + With = { + config.aie_arch = "aie2p", + config.dtype = "bfloat16", + config.i_dim_c = 672 : ui32, + config.i_dim_h = 1 : ui32, + config.i_dim_n = 1 : ui32, + config.i_dim_w = 1 : ui32, + config.rep_dim_c = 1 : ui32, + config.rep_dim_h = 12 : ui32, + config.rep_dim_w = 20 : ui32 + }} { + %461 = tosa.tile %arg5 {multiples = array} : (tensor<1x672x1x1xbf16>) -> tensor<1x672x12x20xbf16> loc(#loc177) + xten_nn.output %461 : tensor<1x672x12x20xbf16> loc(#loc177) + } -> tensor<1x672x12x20xbf16> loc(#loc177) + %320 = xten_nn.subgraph (%arg5 = %319: tensor<1x672x12x20xbf16>, %arg6 = %311: tensor<1x672x12x20xbf16>) attributes { + IfmOperands = [0 : index, 1 : index], + LayerName = "Mul_265", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Mul_265", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : 
vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg7 = %arg5: tensor<1x672x12x20xbf16>, %arg8 = %arg6: tensor<1x672x12x20xbf16>) attributes { + LayerName = "Mul_265", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Mul_265", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + } + ], + Specializes = "MulBf16", + Traits = { + Binary = true, + Elementwise = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.dtype = "bfloat16", + config.num_kernel_iters = 0 : ui16 + }} { + %462 = tosa.mul %arg7, %arg8 { + LayerName = "Mul_265", + OutputName = "Mul_265", + shift = 0 : i8} : (tensor<1x672x12x20xbf16>, tensor<1x672x12x20xbf16>) -> tensor<1x672x12x20xbf16> loc(#loc177) + xten_nn.output %462 : tensor<1x672x12x20xbf16> loc(#loc177) + } -> tensor<1x672x12x20xbf16> loc(#loc177) + xten_nn.output %461 : tensor<1x672x12x20xbf16> loc(#loc177) + } -> tensor<1x672x12x20xbf16> loc(#loc177) + %321 = xten_nn.subgraph (%arg5 = %320: tensor<1x672x12x20xbf16>, %arg6 = %59: tensor<160x672x1x1xbf16>, %arg7 = %58: tensor<160xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Conv_266", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + }, + { + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[160, 672, 1, 1]> : vector<4xindex> + }, + { + UnknownDataFormat = true + } + ], + OutputName = "Conv_266", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 160, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg8 = %arg5: tensor<1x672x12x20xbf16>, %arg9 = %arg6: tensor<160x672x1x1xbf16>, %arg10 = %arg7: tensor<160xbf16>) attributes { + Dilations = array, + HWPadding = [[0, 0], [0, 0]], + LayerName = "Conv_266", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 672, 12, 20]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "wts_data", + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[160, 672, 1, 1]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = 
"Conv_266", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 160, 12, 20]> : vector<4xindex> + } + ], + Specializes = "Conv2DBf16", + Traits = { + AllowDMAOptimization = true + }, + With = { + config.AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 = 1 : ui8, + config.act = 0 : ui8, + config.act_type = "RELU", + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.conv_type = [0 : ui8, 12 : ui8, 64 : ui8], + config.dtype_ifm = "bfloat16", + config.dtype_ofm = "bfloat16", + config.dtype_wts = "bfloat16", + config.ksize.height = 1 : ui8, + config.ksize.width = 1 : ui8, + config.lrelu_alpha = 1.000000e+00 : bf16, + config.lrelu_alpha_kernel = 1.000000e+00 : bf16, + config.stride_h = 1 : ui8, + config.stride_w = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %463 = "tosa.const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc178) + %464 = tosa.reshape %arg9 {new_shape = array} : (tensor<160x672x1x1xbf16>) -> tensor<160x1x1x672xbf16> loc(#loc178) + %465 = tosa.transpose %arg8, %463 : (tensor<1x672x12x20xbf16>, tensor<4xi32>) -> tensor<1x12x20x672xbf16> loc(#loc178) + %466 = tosa.conv2d %465, %464, %arg10 { + PartOfLayerName = "Conv_266", + PartOfOutputName = "Conv_266", + dilation = array, + pad = array, + stride = array} : (tensor<1x12x20x672xbf16>, tensor<160x1x1x672xbf16>, tensor<160xbf16>) -> tensor<1x12x20x160xbf16> loc(#loc178) + %467 = tosa.transpose %466, %462 : (tensor<1x12x20x160xbf16>, tensor<4xi32>) -> tensor<1x160x12x20xbf16> loc(#loc178) + xten_nn.output %467 : tensor<1x160x12x20xbf16> loc(#loc178) + } -> tensor<1x160x12x20xbf16> loc(#loc178) + xten_nn.output %461 : tensor<1x160x12x20xbf16> loc(#loc178) + } -> tensor<1x160x12x20xbf16> loc(#loc178) + %322 = xten_nn.subgraph (%arg5 = %321: tensor<1x160x12x20xbf16>, %arg6 = %57: tensor<960x160x1x1xbf16>, %arg7 = %56: tensor<960xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Conv_267", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 160, 12, 20]> : vector<4xindex> + }, + { + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[960, 160, 1, 1]> : vector<4xindex> + }, + { + UnknownDataFormat = true + } + ], + OutputName = "Conv_267", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg8 = %arg5: tensor<1x160x12x20xbf16>, %arg9 = %arg6: tensor<960x160x1x1xbf16>, %arg10 = %arg7: tensor<960xbf16>) attributes { + Dilations = array, + HWPadding = [[0, 0], [0, 0]], + LayerName = "Conv_267", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 160, 12, 20]> : vector<4xindex> + }, + { + Port = "data_io.wts", 
+ SubPort = "wts_data", + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[960, 160, 1, 1]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Conv_267", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + Specializes = "Conv2DBf16", + Traits = { + AllowDMAOptimization = true + }, + With = { + config.AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 = 1 : ui8, + config.act = 0 : ui8, + config.act_type = "RELU", + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.conv_type = [0 : ui8, 12 : ui8, 64 : ui8], + config.dtype_ifm = "bfloat16", + config.dtype_ofm = "bfloat16", + config.dtype_wts = "bfloat16", + config.ksize.height = 1 : ui8, + config.ksize.width = 1 : ui8, + config.lrelu_alpha = 1.000000e+00 : bf16, + config.lrelu_alpha_kernel = 1.000000e+00 : bf16, + config.stride_h = 1 : ui8, + config.stride_w = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %463 = "tosa.const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc179) + %464 = tosa.reshape %arg9 {new_shape = array} : (tensor<960x160x1x1xbf16>) -> tensor<960x1x1x160xbf16> loc(#loc179) + %465 = tosa.transpose %arg8, %463 : (tensor<1x160x12x20xbf16>, tensor<4xi32>) -> tensor<1x12x20x160xbf16> loc(#loc179) + %466 = tosa.conv2d %465, %464, %arg10 { + PartOfLayerName = "Conv_267", + PartOfOutputName = "Conv_267", + dilation = array, + pad = array, + stride = array} : (tensor<1x12x20x160xbf16>, tensor<960x1x1x160xbf16>, tensor<960xbf16>) -> tensor<1x12x20x960xbf16> loc(#loc179) + %467 = tosa.transpose %466, %462 : (tensor<1x12x20x960xbf16>, tensor<4xi32>) -> tensor<1x960x12x20xbf16> loc(#loc179) + xten_nn.output %467 : tensor<1x960x12x20xbf16> loc(#loc179) + } -> tensor<1x960x12x20xbf16> loc(#loc179) + xten_nn.output %461 : tensor<1x960x12x20xbf16> loc(#loc179) + } -> tensor<1x960x12x20xbf16> loc(#loc179) + %323 = xten_nn.subgraph (%arg5 = %322: tensor<1x960x12x20xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Add_269", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Add_269", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x960x12x20xbf16>) attributes { + LayerName = "Add_269", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Add_269", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + 
L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + Specializes = "AddAttributeBroadcastingBf16", + Traits = { + Elementwise = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.num_kernel_iters = 0 : ui16, + config.scalar = 3.000000e+00 : bf16, + config.scalar_position = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<3.000000e+00> : tensor<1x1x1x1xbf16>}> : () -> tensor<1x1x1x1xbf16> loc(#loc) + %463 = tosa.add %arg6, %462 {LayerName = "Add_269", OutputName = "Add_269"} : (tensor<1x960x12x20xbf16>, tensor<1x1x1x1xbf16>) -> tensor<1x960x12x20xbf16> loc(#loc180) + xten_nn.output %463 : tensor<1x960x12x20xbf16> loc(#loc180) + } -> tensor<1x960x12x20xbf16> loc(#loc180) + xten_nn.output %461 : tensor<1x960x12x20xbf16> loc(#loc180) + } -> tensor<1x960x12x20xbf16> loc(#loc180) + %324 = xten_nn.subgraph (%arg5 = %323: tensor<1x960x12x20xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Clip_272", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Clip_272", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x960x12x20xbf16>) attributes { + LayerName = "Clip_272", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Clip_272", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + Specializes = "ClipBf16", + Traits = { + Elementwise = true, + NonNegativeOut = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.clamp_max = 6.000000e+00 : bf16, + config.clamp_min = 0.000000e+00 : bf16, + config.compiler = "chess", + config.ifm_shift = 0 : si8, + config.num_kernel_iters = 0 : ui16, + config.ofm_shift = 0 : si8 + }} { + %462 = tosa.clamp %arg6 { + LayerName = "Clip_272", + OutputName = "Clip_272", + max_fp = 6.000000e+00 : f32, + max_int = 6 : i64, + min_fp = 0.000000e+00 : f32, + min_int = 0 : i64} : (tensor<1x960x12x20xbf16>) -> tensor<1x960x12x20xbf16> loc(#loc181) + xten_nn.output %462 : tensor<1x960x12x20xbf16> loc(#loc181) + } -> tensor<1x960x12x20xbf16> loc(#loc181) + xten_nn.output %461 : tensor<1x960x12x20xbf16> loc(#loc181) + } -> tensor<1x960x12x20xbf16> loc(#loc181) + %325 = xten_nn.subgraph (%arg5 = %324: tensor<1x960x12x20xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Div_274", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 
20]> : vector<4xindex> + } + ], + OutputName = "Div_274", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x960x12x20xbf16>) attributes { + LayerName = "Div_274", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Div_274", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + Specializes = "MulAttributeBroadcastingBf16", + Traits = { + Elementwise = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.num_kernel_iters = 0 : ui16, + config.scalar = 1.660160e-01 : bf16, + config.scalar_position = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<1.660160e-01> : tensor<1x1x1x1xbf16>}> : () -> tensor<1x1x1x1xbf16> loc(#loc) + %463 = tosa.mul %arg6, %462 { + LayerName = "Div_274", + OutputName = "Div_274", + shift = 0 : i8} : (tensor<1x960x12x20xbf16>, tensor<1x1x1x1xbf16>) -> tensor<1x960x12x20xbf16> loc(#loc182) + xten_nn.output %463 : tensor<1x960x12x20xbf16> loc(#loc182) + } -> tensor<1x960x12x20xbf16> loc(#loc182) + xten_nn.output %461 : tensor<1x960x12x20xbf16> loc(#loc182) + } -> tensor<1x960x12x20xbf16> loc(#loc182) + %326 = xten_nn.subgraph (%arg5 = %322: tensor<1x960x12x20xbf16>, %arg6 = %325: tensor<1x960x12x20xbf16>) attributes { + IfmOperands = [0 : index, 1 : index], + LayerName = "Mul_275", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Mul_275", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg7 = %arg5: tensor<1x960x12x20xbf16>, %arg8 = %arg6: tensor<1x960x12x20xbf16>) attributes { + LayerName = "Mul_275", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<0> : 
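+ // Add_269, Clip_272, Div_274 and Mul_275: hard-swish once more, now on the 960-channel output of Conv_267.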
vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Mul_275", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + Specializes = "MulBf16", + Traits = { + Binary = true, + Elementwise = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.dtype = "bfloat16", + config.num_kernel_iters = 0 : ui16 + }} { + %462 = tosa.mul %arg7, %arg8 { + LayerName = "Mul_275", + OutputName = "Mul_275", + shift = 0 : i8} : (tensor<1x960x12x20xbf16>, tensor<1x960x12x20xbf16>) -> tensor<1x960x12x20xbf16> loc(#loc183) + xten_nn.output %462 : tensor<1x960x12x20xbf16> loc(#loc183) + } -> tensor<1x960x12x20xbf16> loc(#loc183) + xten_nn.output %461 : tensor<1x960x12x20xbf16> loc(#loc183) + } -> tensor<1x960x12x20xbf16> loc(#loc183) + %327 = xten_nn.subgraph (%arg5 = %326: tensor<1x960x12x20xbf16>, %arg6 = %55: tensor<960x1x9x9xbf16>, %arg7 = %54: tensor<960xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Conv_276", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "CMHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[960, 1, 9, 9]> : vector<4xindex> + }, + { + UnknownDataFormat = true + } + ], + OutputName = "Conv_276", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg8 = %arg5: tensor<1x960x12x20xbf16>, %arg9 = %arg6: tensor<960x1x9x9xbf16>, %arg10 = %arg7: tensor<960xbf16>) attributes { + Dilations = array<i64: 1, 1>, + HWPadding = [[4, 4], [4, 4]], + LayerName = "Conv_276", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "CMHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.wts", + SubPort = "wts_data", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[960, 1, 9, 9]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Conv_276", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + Specializes = "DepthwiseConv2dBf16", + With = { + config.act = 0 : ui8, + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.kernel_height = 9 : ui8, + config.kernel_width = 9 : ui8, + config.stride = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () ->
tensor<4xi32> loc(#loc) + %463 = "tosa.const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %464 = "tosa.const"() <{value = dense<[2, 3, 0, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc184) + %465 = tosa.transpose %arg9, %464 : (tensor<960x1x9x9xbf16>, tensor<4xi32>) -> tensor<9x9x960x1xbf16> loc(#loc184) + %466 = tosa.transpose %arg8, %463 : (tensor<1x960x12x20xbf16>, tensor<4xi32>) -> tensor<1x12x20x960xbf16> loc(#loc184) + %467 = tosa.depthwise_conv2d %466, %465, %arg10 { + PartOfLayerName = "Conv_276", + PartOfOutputName = "Conv_276", + dilation = array<i64: 1, 1>, + pad = array<i64: 4, 4, 4, 4>, + stride = array<i64: 1, 1>} : (tensor<1x12x20x960xbf16>, tensor<9x9x960x1xbf16>, tensor<960xbf16>) -> tensor<1x12x20x960xbf16> loc(#loc184) + %468 = tosa.transpose %467, %462 : (tensor<1x12x20x960xbf16>, tensor<4xi32>) -> tensor<1x960x12x20xbf16> loc(#loc184) + xten_nn.output %468 : tensor<1x960x12x20xbf16> loc(#loc184) + } -> tensor<1x960x12x20xbf16> loc(#loc184) + xten_nn.output %461 : tensor<1x960x12x20xbf16> loc(#loc184) + } -> tensor<1x960x12x20xbf16> loc(#loc184) + %328 = xten_nn.subgraph (%arg5 = %327: tensor<1x960x12x20xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Add_278", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Add_278", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x960x12x20xbf16>) attributes { + LayerName = "Add_278", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Add_278", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + Specializes = "AddAttributeBroadcastingBf16", + Traits = { + Elementwise = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.num_kernel_iters = 0 : ui16, + config.scalar = 3.000000e+00 : bf16, + config.scalar_position = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<3.000000e+00> : tensor<1x1x1x1xbf16>}> : () -> tensor<1x1x1x1xbf16> loc(#loc) + %463 = tosa.add %arg6, %462 {LayerName = "Add_278", OutputName = "Add_278"} : (tensor<1x960x12x20xbf16>, tensor<1x1x1x1xbf16>) -> tensor<1x960x12x20xbf16> loc(#loc185) + xten_nn.output %463 : tensor<1x960x12x20xbf16> loc(#loc185) + } -> tensor<1x960x12x20xbf16> loc(#loc185) + xten_nn.output %461 : tensor<1x960x12x20xbf16> loc(#loc185) + } -> tensor<1x960x12x20xbf16> loc(#loc185) + %329 = xten_nn.subgraph (%arg5 = %328: tensor<1x960x12x20xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Clip_281", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization =
"C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Clip_281", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x960x12x20xbf16>) attributes { + LayerName = "Clip_281", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Clip_281", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + Specializes = "ClipBf16", + Traits = { + Elementwise = true, + NonNegativeOut = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.clamp_max = 6.000000e+00 : bf16, + config.clamp_min = 0.000000e+00 : bf16, + config.compiler = "chess", + config.ifm_shift = 0 : si8, + config.num_kernel_iters = 0 : ui16, + config.ofm_shift = 0 : si8 + }} { + %462 = tosa.clamp %arg6 { + LayerName = "Clip_281", + OutputName = "Clip_281", + max_fp = 6.000000e+00 : f32, + max_int = 6 : i64, + min_fp = 0.000000e+00 : f32, + min_int = 0 : i64} : (tensor<1x960x12x20xbf16>) -> tensor<1x960x12x20xbf16> loc(#loc186) + xten_nn.output %462 : tensor<1x960x12x20xbf16> loc(#loc186) + } -> tensor<1x960x12x20xbf16> loc(#loc186) + xten_nn.output %461 : tensor<1x960x12x20xbf16> loc(#loc186) + } -> tensor<1x960x12x20xbf16> loc(#loc186) + %330 = xten_nn.subgraph (%arg5 = %329: tensor<1x960x12x20xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Div_283", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Div_283", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x960x12x20xbf16>) attributes { + LayerName = "Div_283", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Div_283", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + Specializes = "MulAttributeBroadcastingBf16", + Traits = { + 
Elementwise = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.num_kernel_iters = 0 : ui16, + config.scalar = 1.660160e-01 : bf16, + config.scalar_position = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<1.660160e-01> : tensor<1x1x1x1xbf16>}> : () -> tensor<1x1x1x1xbf16> loc(#loc) + %463 = tosa.mul %arg6, %462 { + LayerName = "Div_283", + OutputName = "Div_283", + shift = 0 : i8} : (tensor<1x960x12x20xbf16>, tensor<1x1x1x1xbf16>) -> tensor<1x960x12x20xbf16> loc(#loc187) + xten_nn.output %463 : tensor<1x960x12x20xbf16> loc(#loc187) + } -> tensor<1x960x12x20xbf16> loc(#loc187) + xten_nn.output %461 : tensor<1x960x12x20xbf16> loc(#loc187) + } -> tensor<1x960x12x20xbf16> loc(#loc187) + %331 = xten_nn.subgraph (%arg5 = %327: tensor<1x960x12x20xbf16>, %arg6 = %330: tensor<1x960x12x20xbf16>) attributes { + IfmOperands = [0 : index, 1 : index], + LayerName = "Mul_284", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Mul_284", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg7 = %arg5: tensor<1x960x12x20xbf16>, %arg8 = %arg6: tensor<1x960x12x20xbf16>) attributes { + LayerName = "Mul_284", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Mul_284", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + Specializes = "MulBf16", + Traits = { + Binary = true, + Elementwise = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.dtype = "bfloat16", + config.num_kernel_iters = 0 : ui16 + }} { + %462 = tosa.mul %arg7, %arg8 { + LayerName = "Mul_284", + OutputName = "Mul_284", + shift = 0 : i8} : (tensor<1x960x12x20xbf16>, tensor<1x960x12x20xbf16>) -> tensor<1x960x12x20xbf16> loc(#loc188) + xten_nn.output %462 : tensor<1x960x12x20xbf16> loc(#loc188) + } -> tensor<1x960x12x20xbf16> loc(#loc188) + xten_nn.output %461 : tensor<1x960x12x20xbf16> loc(#loc188) + } -> tensor<1x960x12x20xbf16> loc(#loc188) + %332 = xten_nn.subgraph (%arg5 = %331: tensor<1x960x12x20xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Generated-#42", + Operands = [ + { + 
CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Generated-#43", + Overlay = "1x1_1x1_unspecifiedConnectivity", + Reason = "TemplatedGraph", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 1, 240]> : vector<4xindex> + } + ], + Specializes = "Transpose4dAdf", + With = { + config.aie_arch = "aie2p", + config.dim_0 = 12 : ui32, + config.dim_1 = 120 : ui32, + config.dim_2 = 20 : ui32, + config.dim_3 = 8 : ui32, + config.dtype = "bfloat16", + config.perm = 6 : ui32 + }} { + %461 = tosa.reshape %arg5 {new_shape = array} : (tensor<1x960x12x20xbf16>) -> tensor<1x960x1x240xbf16> loc(#loc340) + xten_nn.output %461 : tensor<1x960x1x240xbf16> loc(#loc340) + } -> tensor<1x960x1x240xbf16> loc(#loc340) + %333 = xten_nn.subgraph (%arg5 = %332: tensor<1x960x1x240xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Generated-#44", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 1, 240]> : vector<4xindex> + } + ], + OutputName = "Generated-#45", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 1, 1]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x960x1x240xbf16>) attributes { + LayerName = "Generated-#44", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 1, 240]> : vector<4xindex> + } + ], + OutputName = "Generated-#45", + PadValue = 0.000000e+00 : bf16, + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 1, 1]> : vector<4xindex> + } + ], + Specializes = "ReduceMeanC8Bf16", + Traits = { + Reduce = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.full_channel = 960 : ui32, + config.full_height = 1 : ui32, + config.full_width = 240 : ui32, + config.reduce_dim = "W" + }} { + %462 = xten_nn.reduce_mean %arg6 {axes = array, keepdims = 1 : i64} : (tensor<1x960x1x240xbf16>) -> tensor<1x960x1x1xbf16> loc(#loc189) + xten_nn.output %462 : tensor<1x960x1x1xbf16> loc(#loc189) + } -> tensor<1x960x1x1xbf16> loc(#loc189) + xten_nn.output %461 : tensor<1x960x1x1xbf16> loc(#loc189) + } -> tensor<1x960x1x1xbf16> loc(#loc189) + %334 = xten_nn.subgraph (%arg5 = %333: tensor<1x960x1x1xbf16>, %arg6 = %53: tensor<240x960x1x1xbf16>, %arg7 = %52: tensor<240xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Conv_286", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + 
L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 1, 1]> : vector<4xindex> + }, + { + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[240, 960, 1, 1]> : vector<4xindex> + }, + { + UnknownDataFormat = true + } + ], + OutputName = "Relu_287", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 240, 1, 1]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg8 = %arg5: tensor<1x960x1x1xbf16>, %arg9 = %arg6: tensor<240x960x1x1xbf16>, %arg10 = %arg7: tensor<240xbf16>) attributes { + Dilations = array, + HWPadding = [[0, 0], [0, 0]], + LayerName = "Conv_286", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 1, 1]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "wts_data", + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[240, 960, 1, 1]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Relu_287", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 240, 1, 1]> : vector<4xindex> + } + ], + Specializes = "Conv2DBf16", + Traits = { + AllowDMAOptimization = true, + NonNegativeOut = true + }, + With = { + config.AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 = 1 : ui8, + config.act = 1 : ui8, + config.act_type = "RELU", + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.conv_type = [0 : ui8, 12 : ui8, 64 : ui8], + config.dtype_ifm = "bfloat16", + config.dtype_ofm = "bfloat16", + config.dtype_wts = "bfloat16", + config.ksize.height = 1 : ui8, + config.ksize.width = 1 : ui8, + config.lrelu_alpha = 0.000000e+00 : bf16, + config.lrelu_alpha_kernel = 0.000000e+00 : bf16, + config.stride_h = 1 : ui8, + config.stride_w = 1 : ui8 + }} { + %462 = tosa.reshape %arg9 {new_shape = array} : (tensor<240x960x1x1xbf16>) -> tensor<240x1x1x960xbf16> loc(#loc341) + %463 = tosa.reshape %arg8 {new_shape = array} : (tensor<1x960x1x1xbf16>) -> tensor<1x1x1x960xbf16> loc(#loc341) + %464 = tosa.conv2d %463, %462, %arg10 { + PartOfLayerName = "Conv_286", + PartOfOutputName = "Conv_286", + dilation = array, + pad = array, + stride = array} : (tensor<1x1x1x960xbf16>, tensor<240x1x1x960xbf16>, tensor<240xbf16>) -> tensor<1x1x1x240xbf16> loc(#loc190) + %465 = tosa.clamp %464 { + LayerName = "Relu_287", + OutputName = "Relu_287", + max_fp = 3.40282347E+38 : f32, + max_int = 2147483647 : i64, + min_fp = 0.000000e+00 : f32, + min_int = 0 : i64} : (tensor<1x1x1x240xbf16>) -> tensor<1x1x1x240xbf16> loc(#loc191) + %466 = tosa.reshape %465 {new_shape = array} : (tensor<1x1x1x240xbf16>) -> tensor<1x240x1x1xbf16> loc(#loc341) + xten_nn.output %466 : tensor<1x240x1x1xbf16> loc(#loc191) + } -> tensor<1x240x1x1xbf16> loc(#loc341) + xten_nn.output %461 : tensor<1x240x1x1xbf16> loc(#loc341) + } -> 
tensor<1x240x1x1xbf16> loc(#loc341) + %335 = xten_nn.subgraph (%arg5 = %334: tensor<1x240x1x1xbf16>, %arg6 = %51: tensor<960x240x1x1xbf16>, %arg7 = %50: tensor<960xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Conv_288", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 240, 1, 1]> : vector<4xindex> + }, + { + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[960, 240, 1, 1]> : vector<4xindex> + }, + { + UnknownDataFormat = true + } + ], + OutputName = "Conv_288", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 1, 1]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg8 = %arg5: tensor<1x240x1x1xbf16>, %arg9 = %arg6: tensor<960x240x1x1xbf16>, %arg10 = %arg7: tensor<960xbf16>) attributes { + Dilations = array, + HWPadding = [[0, 0], [0, 0]], + LayerName = "Conv_288", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 240, 1, 1]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "wts_data", + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[960, 240, 1, 1]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Conv_288", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 1, 1]> : vector<4xindex> + } + ], + Specializes = "Conv2DBf16", + Traits = { + AllowDMAOptimization = true + }, + With = { + config.AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 = 1 : ui8, + config.act = 0 : ui8, + config.act_type = "RELU", + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.conv_type = [0 : ui8, 12 : ui8, 64 : ui8], + config.dtype_ifm = "bfloat16", + config.dtype_ofm = "bfloat16", + config.dtype_wts = "bfloat16", + config.ksize.height = 1 : ui8, + config.ksize.width = 1 : ui8, + config.lrelu_alpha = 1.000000e+00 : bf16, + config.lrelu_alpha_kernel = 1.000000e+00 : bf16, + config.stride_h = 1 : ui8, + config.stride_w = 1 : ui8 + }} { + %462 = tosa.reshape %arg9 {new_shape = array} : (tensor<960x240x1x1xbf16>) -> tensor<960x1x1x240xbf16> loc(#loc192) + %463 = tosa.reshape %arg8 {new_shape = array} : (tensor<1x240x1x1xbf16>) -> tensor<1x1x1x240xbf16> loc(#loc192) + %464 = tosa.conv2d %463, %462, %arg10 { + PartOfLayerName = "Conv_288", + PartOfOutputName = "Conv_288", + dilation = array, + pad = array, + stride = array} : (tensor<1x1x1x240xbf16>, tensor<960x1x1x240xbf16>, tensor<960xbf16>) -> tensor<1x1x1x960xbf16> loc(#loc192) + %465 = tosa.reshape %464 {new_shape = array} : (tensor<1x1x1x960xbf16>) -> tensor<1x960x1x1xbf16> loc(#loc192) + xten_nn.output %465 : tensor<1x960x1x1xbf16> loc(#loc192) + } -> tensor<1x960x1x1xbf16> loc(#loc192) + xten_nn.output %461 : tensor<1x960x1x1xbf16> 
loc(#loc192) + } -> tensor<1x960x1x1xbf16> loc(#loc192) + %336 = xten_nn.subgraph (%arg5 = %335: tensor<1x960x1x1xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Add_290", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 1, 1]> : vector<4xindex> + } + ], + OutputName = "Add_290", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 1, 1]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x960x1x1xbf16>) attributes { + LayerName = "Add_290", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 1, 1]> : vector<4xindex> + } + ], + OutputName = "Add_290", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 1, 1]> : vector<4xindex> + } + ], + Specializes = "AddAttributeBroadcastingBf16", + Traits = { + Elementwise = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.num_kernel_iters = 0 : ui16, + config.scalar = 3.000000e+00 : bf16, + config.scalar_position = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<3.000000e+00> : tensor<1x1x1x1xbf16>}> : () -> tensor<1x1x1x1xbf16> loc(#loc) + %463 = tosa.add %arg6, %462 {LayerName = "Add_290", OutputName = "Add_290"} : (tensor<1x960x1x1xbf16>, tensor<1x1x1x1xbf16>) -> tensor<1x960x1x1xbf16> loc(#loc193) + xten_nn.output %463 : tensor<1x960x1x1xbf16> loc(#loc193) + } -> tensor<1x960x1x1xbf16> loc(#loc193) + xten_nn.output %461 : tensor<1x960x1x1xbf16> loc(#loc193) + } -> tensor<1x960x1x1xbf16> loc(#loc193) + %337 = xten_nn.subgraph (%arg5 = %336: tensor<1x960x1x1xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Clip_293", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 1, 1]> : vector<4xindex> + } + ], + OutputName = "Clip_293", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 1, 1]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x960x1x1xbf16>) attributes { + LayerName = "Clip_293", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 1, 1]> : vector<4xindex> + } + ], + OutputName = "Clip_293", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = 
"C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 1, 1]> : vector<4xindex> + } + ], + Specializes = "ClipBf16", + Traits = { + Elementwise = true, + NonNegativeOut = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.clamp_max = 6.000000e+00 : bf16, + config.clamp_min = 0.000000e+00 : bf16, + config.compiler = "chess", + config.ifm_shift = 0 : si8, + config.num_kernel_iters = 0 : ui16, + config.ofm_shift = 0 : si8 + }} { + %462 = tosa.clamp %arg6 { + LayerName = "Clip_293", + OutputName = "Clip_293", + max_fp = 6.000000e+00 : f32, + max_int = 6 : i64, + min_fp = 0.000000e+00 : f32, + min_int = 0 : i64} : (tensor<1x960x1x1xbf16>) -> tensor<1x960x1x1xbf16> loc(#loc194) + xten_nn.output %462 : tensor<1x960x1x1xbf16> loc(#loc194) + } -> tensor<1x960x1x1xbf16> loc(#loc194) + xten_nn.output %461 : tensor<1x960x1x1xbf16> loc(#loc194) + } -> tensor<1x960x1x1xbf16> loc(#loc194) + %338 = xten_nn.subgraph (%arg5 = %337: tensor<1x960x1x1xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Div_295", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 1, 1]> : vector<4xindex> + } + ], + OutputName = "Div_295", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 1, 1]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x960x1x1xbf16>) attributes { + LayerName = "Div_295", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 1, 1]> : vector<4xindex> + } + ], + OutputName = "Div_295", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 1, 1]> : vector<4xindex> + } + ], + Specializes = "MulAttributeBroadcastingBf16", + Traits = { + Elementwise = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.num_kernel_iters = 0 : ui16, + config.scalar = 1.660160e-01 : bf16, + config.scalar_position = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<1.660160e-01> : tensor<1x1x1x1xbf16>}> : () -> tensor<1x1x1x1xbf16> loc(#loc) + %463 = tosa.mul %arg6, %462 { + LayerName = "Div_295", + OutputName = "Div_295", + shift = 0 : i8} : (tensor<1x960x1x1xbf16>, tensor<1x1x1x1xbf16>) -> tensor<1x960x1x1xbf16> loc(#loc195) + xten_nn.output %463 : tensor<1x960x1x1xbf16> loc(#loc195) + } -> tensor<1x960x1x1xbf16> loc(#loc195) + xten_nn.output %461 : tensor<1x960x1x1xbf16> loc(#loc195) + } -> tensor<1x960x1x1xbf16> loc(#loc195) + %339 = xten_nn.subgraph (%arg5 = %338: tensor<1x960x1x1xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Generated-#46", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = 
dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 1, 1]> : vector<4xindex> + } + ], + OutputName = "Generated-#47", + Overlay = "1x1_1x1_unspecifiedConnectivity", + Reason = "TemplatedGraph", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + Specializes = "TileAdf", + With = { + config.aie_arch = "aie2p", + config.dtype = "bfloat16", + config.i_dim_c = 960 : ui32, + config.i_dim_h = 1 : ui32, + config.i_dim_n = 1 : ui32, + config.i_dim_w = 1 : ui32, + config.rep_dim_c = 1 : ui32, + config.rep_dim_h = 12 : ui32, + config.rep_dim_w = 20 : ui32 + }} { + %461 = tosa.tile %arg5 {multiples = array<i64: 1, 1, 12, 20>} : (tensor<1x960x1x1xbf16>) -> tensor<1x960x12x20xbf16> loc(#loc196) + xten_nn.output %461 : tensor<1x960x12x20xbf16> loc(#loc196) + } -> tensor<1x960x12x20xbf16> loc(#loc196) + %340 = xten_nn.subgraph (%arg5 = %339: tensor<1x960x12x20xbf16>, %arg6 = %331: tensor<1x960x12x20xbf16>) attributes { + IfmOperands = [0 : index, 1 : index], + LayerName = "Mul_296", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Mul_296", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg7 = %arg5: tensor<1x960x12x20xbf16>, %arg8 = %arg6: tensor<1x960x12x20xbf16>) attributes { + LayerName = "Mul_296", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Mul_296", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + Specializes = "MulBf16", + Traits = { + Binary = true, + Elementwise = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.dtype = "bfloat16", + config.num_kernel_iters = 0 : ui16 + }} { + %462 = tosa.mul %arg7, %arg8 { + LayerName = "Mul_296", + OutputName = "Mul_296", + shift = 0 : i8} : (tensor<1x960x12x20xbf16>, tensor<1x960x12x20xbf16>) -> tensor<1x960x12x20xbf16> loc(#loc196) + xten_nn.output %462 : tensor<1x960x12x20xbf16> loc(#loc196) + } -> tensor<1x960x12x20xbf16> loc(#loc196) +
xten_nn.output %461 : tensor<1x960x12x20xbf16> loc(#loc196) + } -> tensor<1x960x12x20xbf16> loc(#loc196) + %341 = xten_nn.subgraph (%arg5 = %340: tensor<1x960x12x20xbf16>, %arg6 = %49: tensor<160x960x1x1xbf16>, %arg7 = %48: tensor<160xbf16>, %arg8 = %321: tensor<1x160x12x20xbf16>) attributes { + IfmOperands = [0 : index, 3 : index], + LayerName = "Conv_297", + OfmShare = 3 : index, + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + }, + { + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[160, 960, 1, 1]> : vector<4xindex> + }, + { + UnknownDataFormat = true + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 160, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Add_298", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 160, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg9 = %arg5: tensor<1x960x12x20xbf16>, %arg10 = %arg6: tensor<160x960x1x1xbf16>, %arg11 = %arg7: tensor<160xbf16>) attributes { + Dilations = array<i64: 1, 1>, + HWPadding = [[0, 0], [0, 0]], + LayerName = "Conv_297", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "wts_data", + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[160, 960, 1, 1]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Conv_297", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 160, 12, 20]> : vector<4xindex> + } + ], + Specializes = "Conv2DBf16", + Traits = { + AllowDMAOptimization = true + }, + With = { + config.AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 = 1 : ui8, + config.act = 0 : ui8, + config.act_type = "RELU", + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.conv_type = [0 : ui8, 12 : ui8, 64 : ui8], + config.dtype_ifm = "bfloat16", + config.dtype_ofm = "bfloat16", + config.dtype_wts = "bfloat16", + config.ksize.height = 1 : ui8, + config.ksize.width = 1 : ui8, + config.lrelu_alpha = 1.000000e+00 : bf16, + config.lrelu_alpha_kernel = 1.000000e+00 : bf16, + config.stride_h = 1 : ui8, + config.stride_w = 1 : ui8 + }} { + %463 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %464 = "tosa.const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc197) + %465 = tosa.reshape %arg10 {new_shape = array<i64: 160, 1, 1, 960>} : (tensor<160x960x1x1xbf16>) -> tensor<160x1x1x960xbf16> loc(#loc197) + %466 = tosa.transpose %arg9, %464 : (tensor<1x960x12x20xbf16>, tensor<4xi32>) ->
tensor<1x12x20x960xbf16> loc(#loc197) + %467 = tosa.conv2d %466, %465, %arg11 { + PartOfLayerName = "Conv_297", + PartOfOutputName = "Conv_297", + dilation = array<i64: 1, 1>, + pad = array<i64: 0, 0, 0, 0>, + stride = array<i64: 1, 1>} : (tensor<1x12x20x960xbf16>, tensor<160x1x1x960xbf16>, tensor<160xbf16>) -> tensor<1x12x20x160xbf16> loc(#loc197) + %468 = tosa.transpose %467, %463 : (tensor<1x12x20x160xbf16>, tensor<4xi32>) -> tensor<1x160x12x20xbf16> loc(#loc197) + xten_nn.output %468 : tensor<1x160x12x20xbf16> loc(#loc197) + } -> tensor<1x160x12x20xbf16> loc(#loc197) + %462 = xten_nn.subgraph (%arg9 = %461: tensor<1x160x12x20xbf16>, %arg10 = %arg8: tensor<1x160x12x20xbf16>) attributes { + LayerName = "Add_298", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 160, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 160, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Add_298", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 160, 12, 20]> : vector<4xindex> + } + ], + Specializes = "AddBf16", + Traits = { + Binary = true, + Elementwise = true + }, + With = { + config.act = 0 : ui8, + config.act_type = "LINEAR", + config.aie_arch = "aie2p", + config.compiler = "chess", + config.dtype = "bfloat16", + config.num_kernel_iters = 0 : ui16 + }} { + %463 = tosa.add %arg9, %arg10 {LayerName = "Add_298", OutputName = "Add_298"} : (tensor<1x160x12x20xbf16>, tensor<1x160x12x20xbf16>) -> tensor<1x160x12x20xbf16> loc(#loc198) + xten_nn.output %463 : tensor<1x160x12x20xbf16> loc(#loc198) + } -> tensor<1x160x12x20xbf16> loc(#loc198) + xten_nn.output %462 : tensor<1x160x12x20xbf16> loc(#loc198) + } -> tensor<1x160x12x20xbf16> loc(#loc342) + %342 = xten_nn.subgraph (%arg5 = %341: tensor<1x160x12x20xbf16>, %arg6 = %47: tensor<960x160x1x1xbf16>, %arg7 = %46: tensor<960xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Conv_299", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 160, 12, 20]> : vector<4xindex> + }, + { + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[960, 160, 1, 1]> : vector<4xindex> + }, + { + UnknownDataFormat = true + } + ], + OutputName = "Conv_299", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg8 = %arg5: tensor<1x160x12x20xbf16>, %arg9 = %arg6: tensor<960x160x1x1xbf16>, %arg10 = %arg7: tensor<960xbf16>) attributes { + Dilations = array<i64: 1, 1>, + HWPadding = [[0, 0], [0, 0]], + LayerName = "Conv_299", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port =
"data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 160, 12, 20]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "wts_data", + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[960, 160, 1, 1]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Conv_299", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + Specializes = "Conv2DBf16", + Traits = { + AllowDMAOptimization = true + }, + With = { + config.AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 = 1 : ui8, + config.act = 0 : ui8, + config.act_type = "RELU", + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.conv_type = [0 : ui8, 12 : ui8, 64 : ui8], + config.dtype_ifm = "bfloat16", + config.dtype_ofm = "bfloat16", + config.dtype_wts = "bfloat16", + config.ksize.height = 1 : ui8, + config.ksize.width = 1 : ui8, + config.lrelu_alpha = 1.000000e+00 : bf16, + config.lrelu_alpha_kernel = 1.000000e+00 : bf16, + config.stride_h = 1 : ui8, + config.stride_w = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %463 = "tosa.const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc199) + %464 = tosa.reshape %arg9 {new_shape = array<i64: 960, 1, 1, 160>} : (tensor<960x160x1x1xbf16>) -> tensor<960x1x1x160xbf16> loc(#loc199) + %465 = tosa.transpose %arg8, %463 : (tensor<1x160x12x20xbf16>, tensor<4xi32>) -> tensor<1x12x20x160xbf16> loc(#loc199) + %466 = tosa.conv2d %465, %464, %arg10 { + PartOfLayerName = "Conv_299", + PartOfOutputName = "Conv_299", + dilation = array<i64: 1, 1>, + pad = array<i64: 0, 0, 0, 0>, + stride = array<i64: 1, 1>} : (tensor<1x12x20x160xbf16>, tensor<960x1x1x160xbf16>, tensor<960xbf16>) -> tensor<1x12x20x960xbf16> loc(#loc199) + %467 = tosa.transpose %466, %462 : (tensor<1x12x20x960xbf16>, tensor<4xi32>) -> tensor<1x960x12x20xbf16> loc(#loc199) + xten_nn.output %467 : tensor<1x960x12x20xbf16> loc(#loc199) + } -> tensor<1x960x12x20xbf16> loc(#loc199) + xten_nn.output %461 : tensor<1x960x12x20xbf16> loc(#loc199) + } -> tensor<1x960x12x20xbf16> loc(#loc199) + %343 = xten_nn.subgraph (%arg5 = %342: tensor<1x960x12x20xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Add_301", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Add_301", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x960x12x20xbf16>) attributes { + LayerName = "Add_301", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> :
vector<4xindex> + } + ], + OutputName = "Add_301", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + Specializes = "AddAttributeBroadcastingBf16", + Traits = { + Elementwise = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.num_kernel_iters = 0 : ui16, + config.scalar = 3.000000e+00 : bf16, + config.scalar_position = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<3.000000e+00> : tensor<1x1x1x1xbf16>}> : () -> tensor<1x1x1x1xbf16> loc(#loc) + %463 = tosa.add %arg6, %462 {LayerName = "Add_301", OutputName = "Add_301"} : (tensor<1x960x12x20xbf16>, tensor<1x1x1x1xbf16>) -> tensor<1x960x12x20xbf16> loc(#loc200) + xten_nn.output %463 : tensor<1x960x12x20xbf16> loc(#loc200) + } -> tensor<1x960x12x20xbf16> loc(#loc200) + xten_nn.output %461 : tensor<1x960x12x20xbf16> loc(#loc200) + } -> tensor<1x960x12x20xbf16> loc(#loc200) + %344 = xten_nn.subgraph (%arg5 = %343: tensor<1x960x12x20xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Clip_304", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Clip_304", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x960x12x20xbf16>) attributes { + LayerName = "Clip_304", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Clip_304", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + Specializes = "ClipBf16", + Traits = { + Elementwise = true, + NonNegativeOut = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.clamp_max = 6.000000e+00 : bf16, + config.clamp_min = 0.000000e+00 : bf16, + config.compiler = "chess", + config.ifm_shift = 0 : si8, + config.num_kernel_iters = 0 : ui16, + config.ofm_shift = 0 : si8 + }} { + %462 = tosa.clamp %arg6 { + LayerName = "Clip_304", + OutputName = "Clip_304", + max_fp = 6.000000e+00 : f32, + max_int = 6 : i64, + min_fp = 0.000000e+00 : f32, + min_int = 0 : i64} : (tensor<1x960x12x20xbf16>) -> tensor<1x960x12x20xbf16> loc(#loc201) + xten_nn.output %462 : tensor<1x960x12x20xbf16> loc(#loc201) + } -> tensor<1x960x12x20xbf16> loc(#loc201) + xten_nn.output %461 : tensor<1x960x12x20xbf16> loc(#loc201) + } -> tensor<1x960x12x20xbf16> loc(#loc201) + %345 = xten_nn.subgraph (%arg5 = %344: tensor<1x960x12x20xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Div_306", + Operands = [ + { + 
CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Div_306", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x960x12x20xbf16>) attributes { + LayerName = "Div_306", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Div_306", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + Specializes = "MulAttributeBroadcastingBf16", + Traits = { + Elementwise = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.num_kernel_iters = 0 : ui16, + config.scalar = 1.660160e-01 : bf16, + config.scalar_position = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<1.660160e-01> : tensor<1x1x1x1xbf16>}> : () -> tensor<1x1x1x1xbf16> loc(#loc) + %463 = tosa.mul %arg6, %462 { + LayerName = "Div_306", + OutputName = "Div_306", + shift = 0 : i8} : (tensor<1x960x12x20xbf16>, tensor<1x1x1x1xbf16>) -> tensor<1x960x12x20xbf16> loc(#loc202) + xten_nn.output %463 : tensor<1x960x12x20xbf16> loc(#loc202) + } -> tensor<1x960x12x20xbf16> loc(#loc202) + xten_nn.output %461 : tensor<1x960x12x20xbf16> loc(#loc202) + } -> tensor<1x960x12x20xbf16> loc(#loc202) + %346 = xten_nn.subgraph (%arg5 = %342: tensor<1x960x12x20xbf16>, %arg6 = %345: tensor<1x960x12x20xbf16>) attributes { + IfmOperands = [0 : index, 1 : index], + LayerName = "Mul_307", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Mul_307", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg7 = %arg5: tensor<1x960x12x20xbf16>, %arg8 = %arg6: tensor<1x960x12x20xbf16>) attributes { + LayerName = "Mul_307", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + 
}, + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Mul_307", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + Specializes = "MulBf16", + Traits = { + Binary = true, + Elementwise = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.dtype = "bfloat16", + config.num_kernel_iters = 0 : ui16 + }} { + %462 = tosa.mul %arg7, %arg8 { + LayerName = "Mul_307", + OutputName = "Mul_307", + shift = 0 : i8} : (tensor<1x960x12x20xbf16>, tensor<1x960x12x20xbf16>) -> tensor<1x960x12x20xbf16> loc(#loc203) + xten_nn.output %462 : tensor<1x960x12x20xbf16> loc(#loc203) + } -> tensor<1x960x12x20xbf16> loc(#loc203) + xten_nn.output %461 : tensor<1x960x12x20xbf16> loc(#loc203) + } -> tensor<1x960x12x20xbf16> loc(#loc203) + %347 = xten_nn.subgraph (%arg5 = %346: tensor<1x960x12x20xbf16>, %arg6 = %45: tensor<960x1x9x9xbf16>, %arg7 = %44: tensor<960xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Conv_308", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "CMHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[960, 1, 9, 9]> : vector<4xindex> + }, + { + UnknownDataFormat = true + } + ], + OutputName = "Conv_308", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg8 = %arg5: tensor<1x960x12x20xbf16>, %arg9 = %arg6: tensor<960x1x9x9xbf16>, %arg10 = %arg7: tensor<960xbf16>) attributes { + Dilations = array<i64: 1, 1>, + HWPadding = [[4, 4], [4, 4]], + LayerName = "Conv_308", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "CMHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.wts", + SubPort = "wts_data", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[960, 1, 9, 9]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Conv_308", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + Specializes = "DepthwiseConv2dBf16", + With = { + config.act = 0 : ui8, + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", +
config.kernel_height = 9 : ui8, + config.kernel_width = 9 : ui8, + config.stride = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %463 = "tosa.const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %464 = "tosa.const"() <{value = dense<[2, 3, 0, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc204) + %465 = tosa.transpose %arg9, %464 : (tensor<960x1x9x9xbf16>, tensor<4xi32>) -> tensor<9x9x960x1xbf16> loc(#loc204) + %466 = tosa.transpose %arg8, %463 : (tensor<1x960x12x20xbf16>, tensor<4xi32>) -> tensor<1x12x20x960xbf16> loc(#loc204) + %467 = tosa.depthwise_conv2d %466, %465, %arg10 { + PartOfLayerName = "Conv_308", + PartOfOutputName = "Conv_308", + dilation = array<i64: 1, 1>, + pad = array<i64: 4, 4, 4, 4>, + stride = array<i64: 1, 1>} : (tensor<1x12x20x960xbf16>, tensor<9x9x960x1xbf16>, tensor<960xbf16>) -> tensor<1x12x20x960xbf16> loc(#loc204) + %468 = tosa.transpose %467, %462 : (tensor<1x12x20x960xbf16>, tensor<4xi32>) -> tensor<1x960x12x20xbf16> loc(#loc204) + xten_nn.output %468 : tensor<1x960x12x20xbf16> loc(#loc204) + } -> tensor<1x960x12x20xbf16> loc(#loc204) + xten_nn.output %461 : tensor<1x960x12x20xbf16> loc(#loc204) + } -> tensor<1x960x12x20xbf16> loc(#loc204) + %348 = xten_nn.subgraph (%arg5 = %347: tensor<1x960x12x20xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Add_310", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Add_310", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x960x12x20xbf16>) attributes { + LayerName = "Add_310", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Add_310", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + Specializes = "AddAttributeBroadcastingBf16", + Traits = { + Elementwise = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.num_kernel_iters = 0 : ui16, + config.scalar = 3.000000e+00 : bf16, + config.scalar_position = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<3.000000e+00> : tensor<1x1x1x1xbf16>}> : () -> tensor<1x1x1x1xbf16> loc(#loc) + %463 = tosa.add %arg6, %462 {LayerName = "Add_310", OutputName = "Add_310"} : (tensor<1x960x12x20xbf16>, tensor<1x1x1x1xbf16>) -> tensor<1x960x12x20xbf16> loc(#loc205) + xten_nn.output %463 : tensor<1x960x12x20xbf16> loc(#loc205) + } -> tensor<1x960x12x20xbf16> loc(#loc205) + xten_nn.output %461 : tensor<1x960x12x20xbf16> loc(#loc205) + } -> tensor<1x960x12x20xbf16> loc(#loc205) + %349 = xten_nn.subgraph (%arg5 = %348:
tensor<1x960x12x20xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Clip_313", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Clip_313", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x960x12x20xbf16>) attributes { + LayerName = "Clip_313", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Clip_313", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + Specializes = "ClipBf16", + Traits = { + Elementwise = true, + NonNegativeOut = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.clamp_max = 6.000000e+00 : bf16, + config.clamp_min = 0.000000e+00 : bf16, + config.compiler = "chess", + config.ifm_shift = 0 : si8, + config.num_kernel_iters = 0 : ui16, + config.ofm_shift = 0 : si8 + }} { + %462 = tosa.clamp %arg6 { + LayerName = "Clip_313", + OutputName = "Clip_313", + max_fp = 6.000000e+00 : f32, + max_int = 6 : i64, + min_fp = 0.000000e+00 : f32, + min_int = 0 : i64} : (tensor<1x960x12x20xbf16>) -> tensor<1x960x12x20xbf16> loc(#loc206) + xten_nn.output %462 : tensor<1x960x12x20xbf16> loc(#loc206) + } -> tensor<1x960x12x20xbf16> loc(#loc206) + xten_nn.output %461 : tensor<1x960x12x20xbf16> loc(#loc206) + } -> tensor<1x960x12x20xbf16> loc(#loc206) + %350 = xten_nn.subgraph (%arg5 = %349: tensor<1x960x12x20xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Div_315", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Div_315", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x960x12x20xbf16>) attributes { + LayerName = "Div_315", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Div_315", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = 
"data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + Specializes = "MulAttributeBroadcastingBf16", + Traits = { + Elementwise = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.num_kernel_iters = 0 : ui16, + config.scalar = 1.660160e-01 : bf16, + config.scalar_position = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<1.660160e-01> : tensor<1x1x1x1xbf16>}> : () -> tensor<1x1x1x1xbf16> loc(#loc) + %463 = tosa.mul %arg6, %462 { + LayerName = "Div_315", + OutputName = "Div_315", + shift = 0 : i8} : (tensor<1x960x12x20xbf16>, tensor<1x1x1x1xbf16>) -> tensor<1x960x12x20xbf16> loc(#loc207) + xten_nn.output %463 : tensor<1x960x12x20xbf16> loc(#loc207) + } -> tensor<1x960x12x20xbf16> loc(#loc207) + xten_nn.output %461 : tensor<1x960x12x20xbf16> loc(#loc207) + } -> tensor<1x960x12x20xbf16> loc(#loc207) + %351 = xten_nn.subgraph (%arg5 = %347: tensor<1x960x12x20xbf16>, %arg6 = %350: tensor<1x960x12x20xbf16>) attributes { + IfmOperands = [0 : index, 1 : index], + LayerName = "Mul_316", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Mul_316", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg7 = %arg5: tensor<1x960x12x20xbf16>, %arg8 = %arg6: tensor<1x960x12x20xbf16>) attributes { + LayerName = "Mul_316", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Mul_316", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + Specializes = "MulBf16", + Traits = { + Binary = true, + Elementwise = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.dtype = "bfloat16", + config.num_kernel_iters = 0 : ui16 + }} { + %462 = tosa.mul %arg7, %arg8 { + LayerName = "Mul_316", + OutputName = "Mul_316", + shift = 0 : i8} : (tensor<1x960x12x20xbf16>, tensor<1x960x12x20xbf16>) -> tensor<1x960x12x20xbf16> loc(#loc208) + xten_nn.output %462 : tensor<1x960x12x20xbf16> loc(#loc208) + } -> tensor<1x960x12x20xbf16> loc(#loc208) + xten_nn.output %461 : tensor<1x960x12x20xbf16> loc(#loc208) + } -> 
tensor<1x960x12x20xbf16> loc(#loc208) + %352 = xten_nn.subgraph (%arg5 = %351: tensor<1x960x12x20xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Generated-#48", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Generated-#49", + Overlay = "1x1_1x1_unspecifiedConnectivity", + Reason = "TemplatedGraph", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 1, 240]> : vector<4xindex> + } + ], + Specializes = "Transpose4dAdf", + With = { + config.aie_arch = "aie2p", + config.dim_0 = 12 : ui32, + config.dim_1 = 120 : ui32, + config.dim_2 = 20 : ui32, + config.dim_3 = 8 : ui32, + config.dtype = "bfloat16", + config.perm = 6 : ui32 + }} { + %461 = tosa.reshape %arg5 {new_shape = array<i64: 1, 960, 1, 240>} : (tensor<1x960x12x20xbf16>) -> tensor<1x960x1x240xbf16> loc(#loc343) + xten_nn.output %461 : tensor<1x960x1x240xbf16> loc(#loc343) + } -> tensor<1x960x1x240xbf16> loc(#loc343) + %353 = xten_nn.subgraph (%arg5 = %352: tensor<1x960x1x240xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Generated-#50", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 1, 240]> : vector<4xindex> + } + ], + OutputName = "Generated-#51", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 1, 1]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x960x1x240xbf16>) attributes { + LayerName = "Generated-#50", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 1, 240]> : vector<4xindex> + } + ], + OutputName = "Generated-#51", + PadValue = 0.000000e+00 : bf16, + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 1, 1]> : vector<4xindex> + } + ], + Specializes = "ReduceMeanC8Bf16", + Traits = { + Reduce = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.full_channel = 960 : ui32, + config.full_height = 1 : ui32, + config.full_width = 240 : ui32, + config.reduce_dim = "W" + }} { + %462 = xten_nn.reduce_mean %arg6 {axes = array<i64: 3>, keepdims = 1 : i64} : (tensor<1x960x1x240xbf16>) -> tensor<1x960x1x1xbf16> loc(#loc209) + xten_nn.output %462 : tensor<1x960x1x1xbf16> loc(#loc209) + } -> tensor<1x960x1x1xbf16> loc(#loc209) + xten_nn.output %461 : tensor<1x960x1x1xbf16> loc(#loc209) + } -> tensor<1x960x1x1xbf16> loc(#loc209) + %354 = xten_nn.subgraph (%arg5 = %353: tensor<1x960x1x1xbf16>, %arg6 = %43: 
tensor<240x960x1x1xbf16>, %arg7 = %42: tensor<240xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Conv_318", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 1, 1]> : vector<4xindex> + }, + { + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[240, 960, 1, 1]> : vector<4xindex> + }, + { + UnknownDataFormat = true + } + ], + OutputName = "Relu_319", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 240, 1, 1]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg8 = %arg5: tensor<1x960x1x1xbf16>, %arg9 = %arg6: tensor<240x960x1x1xbf16>, %arg10 = %arg7: tensor<240xbf16>) attributes { + Dilations = array<i64: 1, 1>, + HWPadding = [[0, 0], [0, 0]], + LayerName = "Conv_318", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 1, 1]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "wts_data", + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[240, 960, 1, 1]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Relu_319", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 240, 1, 1]> : vector<4xindex> + } + ], + Specializes = "Conv2DBf16", + Traits = { + AllowDMAOptimization = true, + NonNegativeOut = true + }, + With = { + config.AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 = 1 : ui8, + config.act = 1 : ui8, + config.act_type = "RELU", + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.conv_type = [0 : ui8, 12 : ui8, 64 : ui8], + config.dtype_ifm = "bfloat16", + config.dtype_ofm = "bfloat16", + config.dtype_wts = "bfloat16", + config.ksize.height = 1 : ui8, + config.ksize.width = 1 : ui8, + config.lrelu_alpha = 0.000000e+00 : bf16, + config.lrelu_alpha_kernel = 0.000000e+00 : bf16, + config.stride_h = 1 : ui8, + config.stride_w = 1 : ui8 + }} { + %462 = tosa.reshape %arg9 {new_shape = array<i64: 240, 1, 1, 960>} : (tensor<240x960x1x1xbf16>) -> tensor<240x1x1x960xbf16> loc(#loc344) + %463 = tosa.reshape %arg8 {new_shape = array<i64: 1, 1, 1, 960>} : (tensor<1x960x1x1xbf16>) -> tensor<1x1x1x960xbf16> loc(#loc344) + %464 = tosa.conv2d %463, %462, %arg10 { + PartOfLayerName = "Conv_318", + PartOfOutputName = "Conv_318", + dilation = array<i64: 1, 1>, + pad = array<i64: 0, 0, 0, 0>, + stride = array<i64: 1, 1>} : (tensor<1x1x1x960xbf16>, tensor<240x1x1x960xbf16>, tensor<240xbf16>) -> tensor<1x1x1x240xbf16> loc(#loc210) + %465 = tosa.clamp %464 { + LayerName = "Relu_319", + OutputName = "Relu_319", + max_fp = 3.40282347E+38 : f32, + max_int = 2147483647 : i64, + min_fp = 0.000000e+00 : f32, + min_int = 0 : i64} : (tensor<1x1x1x240xbf16>) -> tensor<1x1x1x240xbf16> loc(#loc211) + %466 = tosa.reshape %465 {new_shape = array<i64: 1, 240, 1, 1>} : (tensor<1x1x1x240xbf16>) -> 
tensor<1x240x1x1xbf16> loc(#loc344) + xten_nn.output %466 : tensor<1x240x1x1xbf16> loc(#loc211) + } -> tensor<1x240x1x1xbf16> loc(#loc344) + xten_nn.output %461 : tensor<1x240x1x1xbf16> loc(#loc344) + } -> tensor<1x240x1x1xbf16> loc(#loc344) + %355 = xten_nn.subgraph (%arg5 = %354: tensor<1x240x1x1xbf16>, %arg6 = %41: tensor<960x240x1x1xbf16>, %arg7 = %40: tensor<960xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Conv_320", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 240, 1, 1]> : vector<4xindex> + }, + { + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[960, 240, 1, 1]> : vector<4xindex> + }, + { + UnknownDataFormat = true + } + ], + OutputName = "Conv_320", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 1, 1]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg8 = %arg5: tensor<1x240x1x1xbf16>, %arg9 = %arg6: tensor<960x240x1x1xbf16>, %arg10 = %arg7: tensor<960xbf16>) attributes { + Dilations = array<i64: 1, 1>, + HWPadding = [[0, 0], [0, 0]], + LayerName = "Conv_320", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 240, 1, 1]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "wts_data", + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[960, 240, 1, 1]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Conv_320", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 1, 1]> : vector<4xindex> + } + ], + Specializes = "Conv2DBf16", + Traits = { + AllowDMAOptimization = true + }, + With = { + config.AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 = 1 : ui8, + config.act = 0 : ui8, + config.act_type = "RELU", + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.conv_type = [0 : ui8, 12 : ui8, 64 : ui8], + config.dtype_ifm = "bfloat16", + config.dtype_ofm = "bfloat16", + config.dtype_wts = "bfloat16", + config.ksize.height = 1 : ui8, + config.ksize.width = 1 : ui8, + config.lrelu_alpha = 1.000000e+00 : bf16, + config.lrelu_alpha_kernel = 1.000000e+00 : bf16, + config.stride_h = 1 : ui8, + config.stride_w = 1 : ui8 + }} { + %462 = tosa.reshape %arg9 {new_shape = array<i64: 960, 1, 1, 240>} : (tensor<960x240x1x1xbf16>) -> tensor<960x1x1x240xbf16> loc(#loc212) + %463 = tosa.reshape %arg8 {new_shape = array<i64: 1, 1, 1, 240>} : (tensor<1x240x1x1xbf16>) -> tensor<1x1x1x240xbf16> loc(#loc212) + %464 = tosa.conv2d %463, %462, %arg10 { + PartOfLayerName = "Conv_320", + PartOfOutputName = "Conv_320", + dilation = array<i64: 1, 1>, + pad = array<i64: 0, 0, 0, 0>, + stride = array<i64: 1, 1>} : (tensor<1x1x1x240xbf16>, tensor<960x1x1x240xbf16>, tensor<960xbf16>) -> tensor<1x1x1x960xbf16> loc(#loc212) + %465 = tosa.reshape %464 {new_shape = array<i64: 1, 960, 1, 1>} : 
(tensor<1x1x1x960xbf16>) -> tensor<1x960x1x1xbf16> loc(#loc212) + xten_nn.output %465 : tensor<1x960x1x1xbf16> loc(#loc212) + } -> tensor<1x960x1x1xbf16> loc(#loc212) + xten_nn.output %461 : tensor<1x960x1x1xbf16> loc(#loc212) + } -> tensor<1x960x1x1xbf16> loc(#loc212) + %356 = xten_nn.subgraph (%arg5 = %355: tensor<1x960x1x1xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Add_322", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 1, 1]> : vector<4xindex> + } + ], + OutputName = "Add_322", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 1, 1]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x960x1x1xbf16>) attributes { + LayerName = "Add_322", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 1, 1]> : vector<4xindex> + } + ], + OutputName = "Add_322", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 1, 1]> : vector<4xindex> + } + ], + Specializes = "AddAttributeBroadcastingBf16", + Traits = { + Elementwise = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.num_kernel_iters = 0 : ui16, + config.scalar = 3.000000e+00 : bf16, + config.scalar_position = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<3.000000e+00> : tensor<1x1x1x1xbf16>}> : () -> tensor<1x1x1x1xbf16> loc(#loc) + %463 = tosa.add %arg6, %462 {LayerName = "Add_322", OutputName = "Add_322"} : (tensor<1x960x1x1xbf16>, tensor<1x1x1x1xbf16>) -> tensor<1x960x1x1xbf16> loc(#loc213) + xten_nn.output %463 : tensor<1x960x1x1xbf16> loc(#loc213) + } -> tensor<1x960x1x1xbf16> loc(#loc213) + xten_nn.output %461 : tensor<1x960x1x1xbf16> loc(#loc213) + } -> tensor<1x960x1x1xbf16> loc(#loc213) + %357 = xten_nn.subgraph (%arg5 = %356: tensor<1x960x1x1xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Clip_325", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 1, 1]> : vector<4xindex> + } + ], + OutputName = "Clip_325", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 1, 1]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x960x1x1xbf16>) attributes { + LayerName = "Clip_325", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : 
vector<4xindex>, + l3_tile_count = dense<[1, 960, 1, 1]> : vector<4xindex> + } + ], + OutputName = "Clip_325", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 1, 1]> : vector<4xindex> + } + ], + Specializes = "ClipBf16", + Traits = { + Elementwise = true, + NonNegativeOut = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.clamp_max = 6.000000e+00 : bf16, + config.clamp_min = 0.000000e+00 : bf16, + config.compiler = "chess", + config.ifm_shift = 0 : si8, + config.num_kernel_iters = 0 : ui16, + config.ofm_shift = 0 : si8 + }} { + %462 = tosa.clamp %arg6 { + LayerName = "Clip_325", + OutputName = "Clip_325", + max_fp = 6.000000e+00 : f32, + max_int = 6 : i64, + min_fp = 0.000000e+00 : f32, + min_int = 0 : i64} : (tensor<1x960x1x1xbf16>) -> tensor<1x960x1x1xbf16> loc(#loc214) + xten_nn.output %462 : tensor<1x960x1x1xbf16> loc(#loc214) + } -> tensor<1x960x1x1xbf16> loc(#loc214) + xten_nn.output %461 : tensor<1x960x1x1xbf16> loc(#loc214) + } -> tensor<1x960x1x1xbf16> loc(#loc214) + %358 = xten_nn.subgraph (%arg5 = %357: tensor<1x960x1x1xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Div_327", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 1, 1]> : vector<4xindex> + } + ], + OutputName = "Div_327", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 1, 1]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x960x1x1xbf16>) attributes { + LayerName = "Div_327", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 1, 1]> : vector<4xindex> + } + ], + OutputName = "Div_327", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 1, 1]> : vector<4xindex> + } + ], + Specializes = "MulAttributeBroadcastingBf16", + Traits = { + Elementwise = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.num_kernel_iters = 0 : ui16, + config.scalar = 1.660160e-01 : bf16, + config.scalar_position = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<1.660160e-01> : tensor<1x1x1x1xbf16>}> : () -> tensor<1x1x1x1xbf16> loc(#loc) + %463 = tosa.mul %arg6, %462 { + LayerName = "Div_327", + OutputName = "Div_327", + shift = 0 : i8} : (tensor<1x960x1x1xbf16>, tensor<1x1x1x1xbf16>) -> tensor<1x960x1x1xbf16> loc(#loc215) + xten_nn.output %463 : tensor<1x960x1x1xbf16> loc(#loc215) + } -> tensor<1x960x1x1xbf16> loc(#loc215) + xten_nn.output %461 : tensor<1x960x1x1xbf16> loc(#loc215) + } -> tensor<1x960x1x1xbf16> loc(#loc215) + %359 = xten_nn.subgraph (%arg5 = %358: tensor<1x960x1x1xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = 
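+ // Conv_318 (1x1, 960->240, ReLU) and Conv_320 (1x1, 240->960) form a squeeze-and-excitation style bottleneck on the pooled features; Add_322 -> Clip_325 -> Div_327 then apply hard-sigmoid, clamp(x + 3, 0, 6) * (1/6), to produce the per-channel gate that TileAdf broadcasts back to 12x20 for Mul_328.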
"Generated-#52", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 1, 1]> : vector<4xindex> + } + ], + OutputName = "Generated-#53", + Overlay = "1x1_1x1_unspecifiedConnectivity", + Reason = "TemplatedGraph", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + Specializes = "TileAdf", + With = { + config.aie_arch = "aie2p", + config.dtype = "bfloat16", + config.i_dim_c = 960 : ui32, + config.i_dim_h = 1 : ui32, + config.i_dim_n = 1 : ui32, + config.i_dim_w = 1 : ui32, + config.rep_dim_c = 1 : ui32, + config.rep_dim_h = 12 : ui32, + config.rep_dim_w = 20 : ui32 + }} { + %461 = tosa.tile %arg5 {multiples = array} : (tensor<1x960x1x1xbf16>) -> tensor<1x960x12x20xbf16> loc(#loc216) + xten_nn.output %461 : tensor<1x960x12x20xbf16> loc(#loc216) + } -> tensor<1x960x12x20xbf16> loc(#loc216) + %360 = xten_nn.subgraph (%arg5 = %359: tensor<1x960x12x20xbf16>, %arg6 = %351: tensor<1x960x12x20xbf16>) attributes { + IfmOperands = [0 : index, 1 : index], + LayerName = "Mul_328", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Mul_328", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg7 = %arg5: tensor<1x960x12x20xbf16>, %arg8 = %arg6: tensor<1x960x12x20xbf16>) attributes { + LayerName = "Mul_328", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Mul_328", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + Specializes = "MulBf16", + Traits = { + Binary = true, + Elementwise = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.dtype = "bfloat16", + config.num_kernel_iters = 0 : ui16 + }} { + %462 = tosa.mul %arg7, %arg8 { + LayerName = "Mul_328", + OutputName = "Mul_328", 
+ shift = 0 : i8} : (tensor<1x960x12x20xbf16>, tensor<1x960x12x20xbf16>) -> tensor<1x960x12x20xbf16> loc(#loc216) + xten_nn.output %462 : tensor<1x960x12x20xbf16> loc(#loc216) + } -> tensor<1x960x12x20xbf16> loc(#loc216) + xten_nn.output %461 : tensor<1x960x12x20xbf16> loc(#loc216) + } -> tensor<1x960x12x20xbf16> loc(#loc216) + %361 = xten_nn.subgraph (%arg5 = %360: tensor<1x960x12x20xbf16>, %arg6 = %39: tensor<160x960x1x1xbf16>, %arg7 = %38: tensor<160xbf16>, %arg8 = %341: tensor<1x160x12x20xbf16>) attributes { + IfmOperands = [0 : index, 3 : index], + LayerName = "Conv_329", + OfmShare = 3 : index, + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + }, + { + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[160, 960, 1, 1]> : vector<4xindex> + }, + { + UnknownDataFormat = true + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 160, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Add_330", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 160, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg9 = %arg5: tensor<1x960x12x20xbf16>, %arg10 = %arg6: tensor<160x960x1x1xbf16>, %arg11 = %arg7: tensor<160xbf16>) attributes { + Dilations = array, + HWPadding = [[0, 0], [0, 0]], + LayerName = "Conv_329", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "wts_data", + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[160, 960, 1, 1]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Conv_329", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 160, 12, 20]> : vector<4xindex> + } + ], + Specializes = "Conv2DBf16", + Traits = { + AllowDMAOptimization = true + }, + With = { + config.AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 = 1 : ui8, + config.act = 0 : ui8, + config.act_type = "RELU", + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.conv_type = [0 : ui8, 12 : ui8, 64 : ui8], + config.dtype_ifm = "bfloat16", + config.dtype_ofm = "bfloat16", + config.dtype_wts = "bfloat16", + config.ksize.height = 1 : ui8, + config.ksize.width = 1 : ui8, + config.lrelu_alpha = 1.000000e+00 : bf16, + config.lrelu_alpha_kernel = 1.000000e+00 : bf16, + config.stride_h = 1 : ui8, + config.stride_w = 1 : ui8 + }} { + %463 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %464 = "tosa.const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> : () -> 
tensor<4xi32> loc(#loc217) + %465 = tosa.reshape %arg10 {new_shape = array} : (tensor<160x960x1x1xbf16>) -> tensor<160x1x1x960xbf16> loc(#loc217) + %466 = tosa.transpose %arg9, %464 : (tensor<1x960x12x20xbf16>, tensor<4xi32>) -> tensor<1x12x20x960xbf16> loc(#loc217) + %467 = tosa.conv2d %466, %465, %arg11 { + PartOfLayerName = "Conv_329", + PartOfOutputName = "Conv_329", + dilation = array, + pad = array, + stride = array} : (tensor<1x12x20x960xbf16>, tensor<160x1x1x960xbf16>, tensor<160xbf16>) -> tensor<1x12x20x160xbf16> loc(#loc217) + %468 = tosa.transpose %467, %463 : (tensor<1x12x20x160xbf16>, tensor<4xi32>) -> tensor<1x160x12x20xbf16> loc(#loc217) + xten_nn.output %468 : tensor<1x160x12x20xbf16> loc(#loc217) + } -> tensor<1x160x12x20xbf16> loc(#loc217) + %462 = xten_nn.subgraph (%arg9 = %461: tensor<1x160x12x20xbf16>, %arg10 = %arg8: tensor<1x160x12x20xbf16>) attributes { + LayerName = "Add_330", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 160, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 160, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Add_330", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 160, 12, 20]> : vector<4xindex> + } + ], + Specializes = "AddBf16", + Traits = { + Binary = true, + Elementwise = true + }, + With = { + config.act = 0 : ui8, + config.act_type = "LINEAR", + config.aie_arch = "aie2p", + config.compiler = "chess", + config.dtype = "bfloat16", + config.num_kernel_iters = 0 : ui16 + }} { + %463 = tosa.add %arg9, %arg10 {LayerName = "Add_330", OutputName = "Add_330"} : (tensor<1x160x12x20xbf16>, tensor<1x160x12x20xbf16>) -> tensor<1x160x12x20xbf16> loc(#loc218) + xten_nn.output %463 : tensor<1x160x12x20xbf16> loc(#loc218) + } -> tensor<1x160x12x20xbf16> loc(#loc218) + xten_nn.output %462 : tensor<1x160x12x20xbf16> loc(#loc218) + } -> tensor<1x160x12x20xbf16> loc(#loc345) + %362 = xten_nn.subgraph (%arg5 = %361: tensor<1x160x12x20xbf16>, %arg6 = %37: tensor<960x160x1x1xbf16>, %arg7 = %36: tensor<960xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Conv_331", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 160, 12, 20]> : vector<4xindex> + }, + { + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[960, 160, 1, 1]> : vector<4xindex> + }, + { + UnknownDataFormat = true + } + ], + OutputName = "Conv_331", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg8 = %arg5: tensor<1x160x12x20xbf16>, %arg9 = %arg6: tensor<960x160x1x1xbf16>, %arg10 = 
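+ // Conv_329 (1x1, 960->160) followed by Add_330 above is the inverted-residual projection step: the skip tensor arrives through %arg8 (the shared OFM of %341, see OfmShare = 3), so the block output is Conv_329(x) + residual.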
%arg7: tensor<960xbf16>) attributes { + Dilations = array<i64: 1, 1>, + HWPadding = [[0, 0], [0, 0]], + LayerName = "Conv_331", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 160, 12, 20]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "wts_data", + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[960, 160, 1, 1]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Conv_331", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + Specializes = "Conv2DBf16", + Traits = { + AllowDMAOptimization = true + }, + With = { + config.AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 = 1 : ui8, + config.act = 0 : ui8, + config.act_type = "RELU", + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.conv_type = [0 : ui8, 12 : ui8, 64 : ui8], + config.dtype_ifm = "bfloat16", + config.dtype_ofm = "bfloat16", + config.dtype_wts = "bfloat16", + config.ksize.height = 1 : ui8, + config.ksize.width = 1 : ui8, + config.lrelu_alpha = 1.000000e+00 : bf16, + config.lrelu_alpha_kernel = 1.000000e+00 : bf16, + config.stride_h = 1 : ui8, + config.stride_w = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %463 = "tosa.const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc219) + %464 = tosa.reshape %arg9 {new_shape = array<i64: 960, 1, 1, 160>} : (tensor<960x160x1x1xbf16>) -> tensor<960x1x1x160xbf16> loc(#loc219) + %465 = tosa.transpose %arg8, %463 : (tensor<1x160x12x20xbf16>, tensor<4xi32>) -> tensor<1x12x20x160xbf16> loc(#loc219) + %466 = tosa.conv2d %465, %464, %arg10 { + PartOfLayerName = "Conv_331", + PartOfOutputName = "Conv_331", + dilation = array<i64: 1, 1>, + pad = array<i64: 0, 0, 0, 0>, + stride = array<i64: 1, 1>} : (tensor<1x12x20x160xbf16>, tensor<960x1x1x160xbf16>, tensor<960xbf16>) -> tensor<1x12x20x960xbf16> loc(#loc219) + %467 = tosa.transpose %466, %462 : (tensor<1x12x20x960xbf16>, tensor<4xi32>) -> tensor<1x960x12x20xbf16> loc(#loc219) + xten_nn.output %467 : tensor<1x960x12x20xbf16> loc(#loc219) + } -> tensor<1x960x12x20xbf16> loc(#loc219) + xten_nn.output %461 : tensor<1x960x12x20xbf16> loc(#loc219) + } -> tensor<1x960x12x20xbf16> loc(#loc219) + %363 = xten_nn.subgraph (%arg5 = %362: tensor<1x960x12x20xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Add_333", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Add_333", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x960x12x20xbf16>) attributes { + LayerName = "Add_333", 
+ Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Add_333", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + Specializes = "AddAttributeBroadcastingBf16", + Traits = { + Elementwise = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.num_kernel_iters = 0 : ui16, + config.scalar = 3.000000e+00 : bf16, + config.scalar_position = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<3.000000e+00> : tensor<1x1x1x1xbf16>}> : () -> tensor<1x1x1x1xbf16> loc(#loc) + %463 = tosa.add %arg6, %462 {LayerName = "Add_333", OutputName = "Add_333"} : (tensor<1x960x12x20xbf16>, tensor<1x1x1x1xbf16>) -> tensor<1x960x12x20xbf16> loc(#loc220) + xten_nn.output %463 : tensor<1x960x12x20xbf16> loc(#loc220) + } -> tensor<1x960x12x20xbf16> loc(#loc220) + xten_nn.output %461 : tensor<1x960x12x20xbf16> loc(#loc220) + } -> tensor<1x960x12x20xbf16> loc(#loc220) + %364 = xten_nn.subgraph (%arg5 = %363: tensor<1x960x12x20xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Clip_336", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Clip_336", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x960x12x20xbf16>) attributes { + LayerName = "Clip_336", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Clip_336", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + Specializes = "ClipBf16", + Traits = { + Elementwise = true, + NonNegativeOut = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.clamp_max = 6.000000e+00 : bf16, + config.clamp_min = 0.000000e+00 : bf16, + config.compiler = "chess", + config.ifm_shift = 0 : si8, + config.num_kernel_iters = 0 : ui16, + config.ofm_shift = 0 : si8 + }} { + %462 = tosa.clamp %arg6 { + LayerName = "Clip_336", + OutputName = "Clip_336", + max_fp = 6.000000e+00 : f32, + max_int = 6 : i64, + min_fp = 0.000000e+00 : f32, + min_int = 0 : i64} : (tensor<1x960x12x20xbf16>) -> tensor<1x960x12x20xbf16> loc(#loc221) + xten_nn.output %462 : tensor<1x960x12x20xbf16> loc(#loc221) + } -> tensor<1x960x12x20xbf16> loc(#loc221) + xten_nn.output %461 : tensor<1x960x12x20xbf16> 
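+ // Add_333 -> Clip_336 -> Div_338 -> Mul_339 repeat the hard-swish pattern, here applied to the Conv_331 (160 -> 960 expansion) output.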
loc(#loc221) + } -> tensor<1x960x12x20xbf16> loc(#loc221) + %365 = xten_nn.subgraph (%arg5 = %364: tensor<1x960x12x20xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Div_338", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Div_338", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x960x12x20xbf16>) attributes { + LayerName = "Div_338", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Div_338", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + Specializes = "MulAttributeBroadcastingBf16", + Traits = { + Elementwise = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.num_kernel_iters = 0 : ui16, + config.scalar = 1.660160e-01 : bf16, + config.scalar_position = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<1.660160e-01> : tensor<1x1x1x1xbf16>}> : () -> tensor<1x1x1x1xbf16> loc(#loc) + %463 = tosa.mul %arg6, %462 { + LayerName = "Div_338", + OutputName = "Div_338", + shift = 0 : i8} : (tensor<1x960x12x20xbf16>, tensor<1x1x1x1xbf16>) -> tensor<1x960x12x20xbf16> loc(#loc222) + xten_nn.output %463 : tensor<1x960x12x20xbf16> loc(#loc222) + } -> tensor<1x960x12x20xbf16> loc(#loc222) + xten_nn.output %461 : tensor<1x960x12x20xbf16> loc(#loc222) + } -> tensor<1x960x12x20xbf16> loc(#loc222) + %366 = xten_nn.subgraph (%arg5 = %362: tensor<1x960x12x20xbf16>, %arg6 = %365: tensor<1x960x12x20xbf16>) attributes { + IfmOperands = [0 : index, 1 : index], + LayerName = "Mul_339", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Mul_339", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg7 = %arg5: tensor<1x960x12x20xbf16>, %arg8 = %arg6: tensor<1x960x12x20xbf16>) attributes { + LayerName = "Mul_339", + Operands = [ + { + CurrentDataFormat = 
"NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Mul_339", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + Specializes = "MulBf16", + Traits = { + Binary = true, + Elementwise = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.dtype = "bfloat16", + config.num_kernel_iters = 0 : ui16 + }} { + %462 = tosa.mul %arg7, %arg8 { + LayerName = "Mul_339", + OutputName = "Mul_339", + shift = 0 : i8} : (tensor<1x960x12x20xbf16>, tensor<1x960x12x20xbf16>) -> tensor<1x960x12x20xbf16> loc(#loc223) + xten_nn.output %462 : tensor<1x960x12x20xbf16> loc(#loc223) + } -> tensor<1x960x12x20xbf16> loc(#loc223) + xten_nn.output %461 : tensor<1x960x12x20xbf16> loc(#loc223) + } -> tensor<1x960x12x20xbf16> loc(#loc223) + %367 = xten_nn.subgraph (%arg5 = %366: tensor<1x960x12x20xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Generated-#54", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Generated-#55", + Overlay = "1x1_1x1_unspecifiedConnectivity", + Reason = "TemplatedGraph", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 1, 240]> : vector<4xindex> + } + ], + Specializes = "Transpose4dAdf", + With = { + config.aie_arch = "aie2p", + config.dim_0 = 12 : ui32, + config.dim_1 = 120 : ui32, + config.dim_2 = 20 : ui32, + config.dim_3 = 8 : ui32, + config.dtype = "bfloat16", + config.perm = 6 : ui32 + }} { + %461 = tosa.reshape %arg5 {new_shape = array} : (tensor<1x960x12x20xbf16>) -> tensor<1x960x1x240xbf16> loc(#loc346) + xten_nn.output %461 : tensor<1x960x1x240xbf16> loc(#loc346) + } -> tensor<1x960x1x240xbf16> loc(#loc346) + %368 = xten_nn.subgraph (%arg5 = %367: tensor<1x960x1x240xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Generated-#56", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 1, 240]> : vector<4xindex> + } + ], + OutputName = "Generated-#57", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 1, 1]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = 
%arg5: tensor<1x960x1x240xbf16>) attributes { + LayerName = "Generated-#56", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 1, 240]> : vector<4xindex> + } + ], + OutputName = "Generated-#57", + PadValue = 0.000000e+00 : bf16, + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 1, 1]> : vector<4xindex> + } + ], + Specializes = "ReduceMeanC8Bf16", + Traits = { + Reduce = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.full_channel = 960 : ui32, + config.full_height = 1 : ui32, + config.full_width = 240 : ui32, + config.reduce_dim = "W" + }} { + %462 = xten_nn.reduce_mean %arg6 {axes = array, keepdims = 1 : i64} : (tensor<1x960x1x240xbf16>) -> tensor<1x960x1x1xbf16> loc(#loc224) + xten_nn.output %462 : tensor<1x960x1x1xbf16> loc(#loc224) + } -> tensor<1x960x1x1xbf16> loc(#loc224) + xten_nn.output %461 : tensor<1x960x1x1xbf16> loc(#loc224) + } -> tensor<1x960x1x1xbf16> loc(#loc224) + %369 = xten_nn.subgraph (%arg5 = %368: tensor<1x960x1x1xbf16>, %arg6 = %35: tensor<128x960x1x1xbf16>, %arg7 = %34: tensor<128xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Conv_343", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 1, 1]> : vector<4xindex> + }, + { + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[128, 960, 1, 1]> : vector<4xindex> + }, + { + UnknownDataFormat = true + } + ], + OutputName = "Conv_343", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 128, 1, 1]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg8 = %arg5: tensor<1x960x1x1xbf16>, %arg9 = %arg6: tensor<128x960x1x1xbf16>, %arg10 = %arg7: tensor<128xbf16>) attributes { + Dilations = array, + HWPadding = [[0, 0], [0, 0]], + LayerName = "Conv_343", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 1, 1]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "wts_data", + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[128, 960, 1, 1]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Conv_343", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 128, 1, 1]> : vector<4xindex> + } + ], + Specializes = "Conv2DBf16", + Traits = { + AllowDMAOptimization = true + }, + With = { + config.AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 = 1 : ui8, + config.act = 0 : ui8, + config.act_type = "RELU", + 
config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.conv_type = [0 : ui8, 12 : ui8, 64 : ui8], + config.dtype_ifm = "bfloat16", + config.dtype_ofm = "bfloat16", + config.dtype_wts = "bfloat16", + config.ksize.height = 1 : ui8, + config.ksize.width = 1 : ui8, + config.lrelu_alpha = 1.000000e+00 : bf16, + config.lrelu_alpha_kernel = 1.000000e+00 : bf16, + config.stride_h = 1 : ui8, + config.stride_w = 1 : ui8 + }} { + %462 = tosa.reshape %arg9 {new_shape = array} : (tensor<128x960x1x1xbf16>) -> tensor<128x1x1x960xbf16> loc(#loc225) + %463 = tosa.reshape %arg8 {new_shape = array} : (tensor<1x960x1x1xbf16>) -> tensor<1x1x1x960xbf16> loc(#loc225) + %464 = tosa.conv2d %463, %462, %arg10 { + PartOfLayerName = "Conv_343", + PartOfOutputName = "Conv_343", + dilation = array, + pad = array, + stride = array} : (tensor<1x1x1x960xbf16>, tensor<128x1x1x960xbf16>, tensor<128xbf16>) -> tensor<1x1x1x128xbf16> loc(#loc225) + %465 = tosa.reshape %464 {new_shape = array} : (tensor<1x1x1x128xbf16>) -> tensor<1x128x1x1xbf16> loc(#loc225) + xten_nn.output %465 : tensor<1x128x1x1xbf16> loc(#loc225) + } -> tensor<1x128x1x1xbf16> loc(#loc225) + xten_nn.output %461 : tensor<1x128x1x1xbf16> loc(#loc225) + } -> tensor<1x128x1x1xbf16> loc(#loc225) + %370 = xten_nn.subgraph (%arg5 = %369: tensor<1x128x1x1xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Sigmoid_344", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 128, 1, 1]> : vector<4xindex> + } + ], + OutputName = "Sigmoid_344", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 128, 1, 1]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x128x1x1xbf16>) attributes { + LayerName = "Sigmoid_344", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 128, 1, 1]> : vector<4xindex> + } + ], + OutputName = "Sigmoid_344", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 128, 1, 1]> : vector<4xindex> + } + ], + Specializes = "SigmoidTemplatedBf16", + Traits = { + Elementwise = true, + NonNegativeOut = true, + Unary = true + }, + With = { + config.ENABLE_FP16_AS_BF16 = 0 : ui8, + config.aie_arch = "aie2p", + config.compiler = "chess", + config.ifm_shift = 0 : si8, + config.num_kernel_iters = 0 : ui16, + config.ofm_shift = 0 : si8 + }} { + %462 = tosa.sigmoid %arg6 {LayerName = "Sigmoid_344", OutputName = "Sigmoid_344"} : (tensor<1x128x1x1xbf16>) -> tensor<1x128x1x1xbf16> loc(#loc226) + xten_nn.output %462 : tensor<1x128x1x1xbf16> loc(#loc226) + } -> tensor<1x128x1x1xbf16> loc(#loc226) + xten_nn.output %461 : tensor<1x128x1x1xbf16> loc(#loc226) + } -> tensor<1x128x1x1xbf16> loc(#loc226) + %371 = xten_nn.subgraph (%arg5 = %370: tensor<1x128x1x1xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Generated-#58", 
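+ // Sigmoid_344 turns the pooled Conv_343 output into a 1x128x1x1 gate; TileAdf below broadcasts it to 12x20 so that Mul_345 can scale the Conv_340/Relu_341 features elementwise, i.e. a sigmoid-gated channel attention over the bottleneck output.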
+ Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 128, 1, 1]> : vector<4xindex> + } + ], + OutputName = "Generated-#59", + Overlay = "1x1_1x1_unspecifiedConnectivity", + Reason = "TemplatedGraph", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 128, 12, 20]> : vector<4xindex> + } + ], + Specializes = "TileAdf", + With = { + config.aie_arch = "aie2p", + config.dtype = "bfloat16", + config.i_dim_c = 128 : ui32, + config.i_dim_h = 1 : ui32, + config.i_dim_n = 1 : ui32, + config.i_dim_w = 1 : ui32, + config.rep_dim_c = 1 : ui32, + config.rep_dim_h = 12 : ui32, + config.rep_dim_w = 20 : ui32 + }} { + %461 = tosa.tile %arg5 {multiples = array} : (tensor<1x128x1x1xbf16>) -> tensor<1x128x12x20xbf16> loc(#loc227) + xten_nn.output %461 : tensor<1x128x12x20xbf16> loc(#loc227) + } -> tensor<1x128x12x20xbf16> loc(#loc227) + %372 = xten_nn.subgraph (%arg5 = %366: tensor<1x960x12x20xbf16>, %arg6 = %33: tensor<128x960x1x1xbf16>, %arg7 = %32: tensor<128xbf16>, %arg8 = %371: tensor<1x128x12x20xbf16>) attributes { + IfmOperands = [0 : index, 3 : index], + LayerName = "Conv_340", + OfmShare = 3 : index, + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + }, + { + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[128, 960, 1, 1]> : vector<4xindex> + }, + { + UnknownDataFormat = true + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 128, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Mul_345", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 128, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg9 = %arg5: tensor<1x960x12x20xbf16>, %arg10 = %arg6: tensor<128x960x1x1xbf16>, %arg11 = %arg7: tensor<128xbf16>) attributes { + Dilations = array, + HWPadding = [[0, 0], [0, 0]], + LayerName = "Conv_340", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 960, 12, 20]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "wts_data", + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[128, 960, 1, 1]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Relu_341", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = 
dense<[1, 128, 12, 20]> : vector<4xindex> + } + ], + Specializes = "Conv2DBf16", + Traits = { + AllowDMAOptimization = true, + NonNegativeOut = true + }, + With = { + config.AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 = 1 : ui8, + config.act = 1 : ui8, + config.act_type = "RELU", + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.conv_type = [0 : ui8, 12 : ui8, 64 : ui8], + config.dtype_ifm = "bfloat16", + config.dtype_ofm = "bfloat16", + config.dtype_wts = "bfloat16", + config.ksize.height = 1 : ui8, + config.ksize.width = 1 : ui8, + config.lrelu_alpha = 0.000000e+00 : bf16, + config.lrelu_alpha_kernel = 0.000000e+00 : bf16, + config.stride_h = 1 : ui8, + config.stride_w = 1 : ui8 + }} { + %463 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %464 = "tosa.const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc348) + %465 = tosa.reshape %arg10 {new_shape = array} : (tensor<128x960x1x1xbf16>) -> tensor<128x1x1x960xbf16> loc(#loc348) + %466 = tosa.transpose %arg9, %464 : (tensor<1x960x12x20xbf16>, tensor<4xi32>) -> tensor<1x12x20x960xbf16> loc(#loc348) + %467 = tosa.conv2d %466, %465, %arg11 { + PartOfLayerName = "Conv_340", + PartOfOutputName = "Conv_340", + dilation = array, + pad = array, + stride = array} : (tensor<1x12x20x960xbf16>, tensor<128x1x1x960xbf16>, tensor<128xbf16>) -> tensor<1x12x20x128xbf16> loc(#loc228) + %468 = tosa.clamp %467 { + LayerName = "Relu_341", + OutputName = "Relu_341", + max_fp = 3.40282347E+38 : f32, + max_int = 2147483647 : i64, + min_fp = 0.000000e+00 : f32, + min_int = 0 : i64} : (tensor<1x12x20x128xbf16>) -> tensor<1x12x20x128xbf16> loc(#loc229) + %469 = tosa.transpose %468, %463 : (tensor<1x12x20x128xbf16>, tensor<4xi32>) -> tensor<1x128x12x20xbf16> loc(#loc348) + xten_nn.output %469 : tensor<1x128x12x20xbf16> loc(#loc229) + } -> tensor<1x128x12x20xbf16> loc(#loc348) + %462 = xten_nn.subgraph (%arg9 = %461: tensor<1x128x12x20xbf16>, %arg10 = %arg8: tensor<1x128x12x20xbf16>) attributes { + LayerName = "Mul_345", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 128, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 128, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Mul_345", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 128, 12, 20]> : vector<4xindex> + } + ], + Specializes = "MulBf16", + Traits = { + Binary = true, + Elementwise = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.dtype = "bfloat16", + config.num_kernel_iters = 0 : ui16 + }} { + %463 = tosa.mul %arg9, %arg10 { + LayerName = "Mul_345", + OutputName = "Mul_345", + shift = 0 : i8} : (tensor<1x128x12x20xbf16>, tensor<1x128x12x20xbf16>) -> tensor<1x128x12x20xbf16> loc(#loc227) + xten_nn.output %463 : tensor<1x128x12x20xbf16> loc(#loc227) + } -> tensor<1x128x12x20xbf16> loc(#loc227) + xten_nn.output %462 : tensor<1x128x12x20xbf16> loc(#loc227) + } -> tensor<1x128x12x20xbf16> loc(#loc347) + %373 = 
xten_nn.subgraph (%arg5 = %372: tensor<1x128x12x20xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Split_349_Duplicated#0", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 128, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Split_349_Duplicated#0", + Overlay = "1x1_1x1_unspecifiedConnectivity", + Reason = "TemplatedGraph", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 64, 12, 20]> : vector<4xindex> + } + ], + Specializes = "SliceHCWC8Adf", + With = { + config.aie_arch = "aie2p", + config.axis_letter = "C", + config.dim_c = 128 : ui32, + config.dim_h = 12 : ui32, + config.dim_w = 20 : ui32, + config.dtype = "bfloat16", + config.end = 64 : ui32, + config.num_ifm_shim_ch = 2 : ui32, + config.num_ofm_shim_ch = 2 : ui32, + config.start = 0 : ui32, + config.step = 1 : ui32 + }} { + %461 = tosa.slice %arg5 { + PartOfLayerName = "Split_349", + PartOfOutputName = "Split_349", + size = array<i64: 1, 64, 12, 20>, + start = array<i64: 0, 0, 0, 0>} : (tensor<1x128x12x20xbf16>) -> tensor<1x64x12x20xbf16> loc(#loc230) + xten_nn.output %461 : tensor<1x64x12x20xbf16> loc(#loc230) + } -> tensor<1x64x12x20xbf16> loc(#loc230) + %374 = xten_nn.subgraph (%arg5 = %372: tensor<1x128x12x20xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Split_349_Duplicated#1", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 128, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Split_349_Duplicated#1", + Overlay = "1x1_1x1_unspecifiedConnectivity", + Reason = "TemplatedGraph", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 64, 12, 20]> : vector<4xindex> + } + ], + Specializes = "SliceHCWC8Adf", + With = { + config.aie_arch = "aie2p", + config.axis_letter = "C", + config.dim_c = 128 : ui32, + config.dim_h = 12 : ui32, + config.dim_w = 20 : ui32, + config.dtype = "bfloat16", + config.end = 128 : ui32, + config.num_ifm_shim_ch = 2 : ui32, + config.num_ofm_shim_ch = 2 : ui32, + config.start = 64 : ui32, + config.step = 1 : ui32 + }} { + %461 = tosa.slice %arg5 { + PartOfLayerName = "Split_349", + PartOfOutputName = "Split_349", + size = array<i64: 1, 64, 12, 20>, + start = array<i64: 0, 64, 0, 0>} : (tensor<1x128x12x20xbf16>) -> tensor<1x64x12x20xbf16> loc(#loc230) + xten_nn.output %461 : tensor<1x64x12x20xbf16> loc(#loc230) + } -> tensor<1x64x12x20xbf16> loc(#loc230) + %375 = xten_nn.subgraph (%arg5 = %374: tensor<1x64x12x20xbf16>, %arg6 = %arg4: tensor<1x64x12x20xbf16>) attributes { + Axis = 1 : i32, + IfmOperands = [0 : index, 1 : index], + LayerName = "Concat_350", + Op = "Concat", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 64, 12, 20]> : 
vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 64, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Concat_350", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "PseudoOp", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 128, 12, 20]> : vector<4xindex> + } + ], + current_data_format = "NCHW", + data_format = "HCWN"} { + %461 = tosa.concat %arg5, %arg6 { + LayerName = "Concat_350", + OutputName = "Concat_350", + axis = 1 : i32} : (tensor<1x64x12x20xbf16>, tensor<1x64x12x20xbf16>) -> tensor<1x128x12x20xbf16> loc(#loc231) + xten_nn.output %461 : tensor<1x128x12x20xbf16> loc(#loc231) + } -> tensor<1x128x12x20xbf16> loc(#loc231) + %376 = xten_nn.subgraph (%arg5 = %375: tensor<1x128x12x20xbf16>, %arg6 = %31: tensor<128x128x3x3xbf16>, %arg7 = %30: tensor<128xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Conv_351", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 128, 12, 20]> : vector<4xindex> + }, + { + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[128, 128, 3, 3]> : vector<4xindex> + }, + { + UnknownDataFormat = true + } + ], + OutputName = "Conv_351", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 128, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg8 = %arg5: tensor<1x128x12x20xbf16>, %arg9 = %arg6: tensor<128x128x3x3xbf16>, %arg10 = %arg7: tensor<128xbf16>) attributes { + Dilations = array<i64: 1, 1>, + HWPadding = [[1, 1], [1, 1]], + LayerName = "Conv_351", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 128, 12, 20]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "wts_data", + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[128, 128, 3, 3]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Conv_351", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 128, 12, 20]> : vector<4xindex> + } + ], + Specializes = "Conv2DBf16", + Traits = { + AllowDMAOptimization = true + }, + With = { + config.AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 = 1 : ui8, + config.act = 0 : ui8, + config.act_type = "RELU", + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.conv_type = [0 : ui8, 12 : ui8, 64 : ui8], + config.dtype_ifm = "bfloat16", + config.dtype_ofm = 
"bfloat16", + config.dtype_wts = "bfloat16", + config.ksize.height = 3 : ui8, + config.ksize.width = 3 : ui8, + config.lrelu_alpha = 1.000000e+00 : bf16, + config.lrelu_alpha_kernel = 1.000000e+00 : bf16, + config.stride_h = 1 : ui8, + config.stride_w = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %463 = "tosa.const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %464 = tosa.transpose %arg9, %463 : (tensor<128x128x3x3xbf16>, tensor<4xi32>) -> tensor<128x3x3x128xbf16> loc(#loc232) + %465 = tosa.transpose %arg8, %463 : (tensor<1x128x12x20xbf16>, tensor<4xi32>) -> tensor<1x12x20x128xbf16> loc(#loc232) + %466 = tosa.conv2d %465, %464, %arg10 { + PartOfLayerName = "Conv_351", + PartOfOutputName = "Conv_351", + dilation = array, + pad = array, + stride = array} : (tensor<1x12x20x128xbf16>, tensor<128x3x3x128xbf16>, tensor<128xbf16>) -> tensor<1x12x20x128xbf16> loc(#loc232) + %467 = tosa.transpose %466, %462 : (tensor<1x12x20x128xbf16>, tensor<4xi32>) -> tensor<1x128x12x20xbf16> loc(#loc232) + xten_nn.output %467 : tensor<1x128x12x20xbf16> loc(#loc232) + } -> tensor<1x128x12x20xbf16> loc(#loc232) + xten_nn.output %461 : tensor<1x128x12x20xbf16> loc(#loc232) + } -> tensor<1x128x12x20xbf16> loc(#loc232) + %377 = xten_nn.subgraph (%arg5 = %376: tensor<1x128x12x20xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Sigmoid_352", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 128, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Sigmoid_352", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 128, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x128x12x20xbf16>) attributes { + LayerName = "Sigmoid_352", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 128, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Sigmoid_352", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 128, 12, 20]> : vector<4xindex> + } + ], + Specializes = "SigmoidTemplatedBf16", + Traits = { + Elementwise = true, + NonNegativeOut = true, + Unary = true + }, + With = { + config.ENABLE_FP16_AS_BF16 = 0 : ui8, + config.aie_arch = "aie2p", + config.compiler = "chess", + config.ifm_shift = 0 : si8, + config.num_kernel_iters = 0 : ui16, + config.ofm_shift = 0 : si8 + }} { + %462 = tosa.sigmoid %arg6 {LayerName = "Sigmoid_352", OutputName = "Sigmoid_352"} : (tensor<1x128x12x20xbf16>) -> tensor<1x128x12x20xbf16> loc(#loc233) + xten_nn.output %462 : tensor<1x128x12x20xbf16> loc(#loc233) + } -> tensor<1x128x12x20xbf16> loc(#loc233) + xten_nn.output %461 : tensor<1x128x12x20xbf16> loc(#loc233) + } -> tensor<1x128x12x20xbf16> loc(#loc233) + %378 = xten_nn.subgraph (%arg5 = %377: tensor<1x128x12x20xbf16>) 
attributes { + IfmOperands = [0 : index], + LayerName = "Split_353_Duplicated#1", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 128, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Split_353_Duplicated#1", + Overlay = "1x1_1x1_unspecifiedConnectivity", + Reason = "TemplatedGraph", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 64, 12, 20]> : vector<4xindex> + } + ], + Specializes = "SliceHCWC8Adf", + With = { + config.aie_arch = "aie2p", + config.axis_letter = "C", + config.dim_c = 128 : ui32, + config.dim_h = 12 : ui32, + config.dim_w = 20 : ui32, + config.dtype = "bfloat16", + config.end = 128 : ui32, + config.num_ifm_shim_ch = 2 : ui32, + config.num_ofm_shim_ch = 2 : ui32, + config.start = 64 : ui32, + config.step = 1 : ui32 + }} { + %461 = tosa.slice %arg5 { + PartOfLayerName = "Split_353", + PartOfOutputName = "Split_353", + size = array<i64: 1, 64, 12, 20>, + start = array<i64: 0, 64, 0, 0>} : (tensor<1x128x12x20xbf16>) -> tensor<1x64x12x20xbf16> loc(#loc234) + xten_nn.output %461 : tensor<1x64x12x20xbf16> loc(#loc234) + } -> tensor<1x64x12x20xbf16> loc(#loc234) + %379 = xten_nn.subgraph (%arg5 = %27: tensor<1x64x12x20xbf16>, %arg6 = %378: tensor<1x64x12x20xbf16>) attributes { + IfmOperands = [1 : index], + LayerName = "Sub_359", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 64, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 64, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Sub_359", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 64, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg7 = %arg5: tensor<1x64x12x20xbf16>, %arg8 = %arg6: tensor<1x64x12x20xbf16>) attributes { + LayerName = "Sub_359", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 64, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 64, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Sub_359", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 64, 12, 20]> : vector<4xindex> + } + ], + Specializes = "SubBf16", + Traits = { + Binary = true, + Elementwise = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", 
+ config.dtype = "bfloat16", + config.num_kernel_iters = 0 : ui16 + }} { + %462 = tosa.sub %arg7, %arg8 {LayerName = "Sub_359", OutputName = "Sub_359"} : (tensor<1x64x12x20xbf16>, tensor<1x64x12x20xbf16>) -> tensor<1x64x12x20xbf16> loc(#loc5) + xten_nn.output %462 : tensor<1x64x12x20xbf16> loc(#loc5) + } -> tensor<1x64x12x20xbf16> loc(#loc5) + xten_nn.output %461 : tensor<1x64x12x20xbf16> loc(#loc5) + } -> tensor<1x64x12x20xbf16> loc(#loc5) + %380 = xten_nn.subgraph (%arg5 = %379: tensor<1x64x12x20xbf16>, %arg6 = %arg4: tensor<1x64x12x20xbf16>) attributes { + IfmOperands = [0 : index, 1 : index], + LayerName = "Mul_360", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 64, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 64, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Mul_360", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 64, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg7 = %arg5: tensor<1x64x12x20xbf16>, %arg8 = %arg6: tensor<1x64x12x20xbf16>) attributes { + LayerName = "Mul_360", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 64, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 64, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Mul_360", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 64, 12, 20]> : vector<4xindex> + } + ], + Specializes = "MulBf16", + Traits = { + Binary = true, + Elementwise = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.dtype = "bfloat16", + config.num_kernel_iters = 0 : ui16 + }} { + %462 = tosa.mul %arg7, %arg8 { + LayerName = "Mul_360", + OutputName = "Mul_360", + shift = 0 : i8} : (tensor<1x64x12x20xbf16>, tensor<1x64x12x20xbf16>) -> tensor<1x64x12x20xbf16> loc(#loc240) + xten_nn.output %462 : tensor<1x64x12x20xbf16> loc(#loc240) + } -> tensor<1x64x12x20xbf16> loc(#loc240) + xten_nn.output %461 : tensor<1x64x12x20xbf16> loc(#loc240) + } -> tensor<1x64x12x20xbf16> loc(#loc240) + %381 = xten_nn.subgraph (%arg5 = %377: tensor<1x128x12x20xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Split_353_Duplicated#0", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 128, 12, 20]> : vector<4xindex> + } + ], + OutputName = 
"Split_353_Duplicated#0", + Overlay = "1x1_1x1_unspecifiedConnectivity", + Reason = "TemplatedGraph", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 64, 12, 20]> : vector<4xindex> + } + ], + Specializes = "SliceHCWC8Adf", + With = { + config.aie_arch = "aie2p", + config.axis_letter = "C", + config.dim_c = 128 : ui32, + config.dim_h = 12 : ui32, + config.dim_w = 20 : ui32, + config.dtype = "bfloat16", + config.end = 64 : ui32, + config.num_ifm_shim_ch = 2 : ui32, + config.num_ofm_shim_ch = 2 : ui32, + config.start = 0 : ui32, + config.step = 1 : ui32 + }} { + %461 = tosa.slice %arg5 { + PartOfLayerName = "Split_353", + PartOfOutputName = "Split_353", + size = array, + start = array} : (tensor<1x128x12x20xbf16>) -> tensor<1x64x12x20xbf16> loc(#loc234) + xten_nn.output %461 : tensor<1x64x12x20xbf16> loc(#loc234) + } -> tensor<1x64x12x20xbf16> loc(#loc234) + %382 = xten_nn.subgraph (%arg5 = %381: tensor<1x64x12x20xbf16>, %arg6 = %arg4: tensor<1x64x12x20xbf16>) attributes { + IfmOperands = [0 : index, 1 : index], + LayerName = "Mul_354", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 64, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 64, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Mul_354", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 64, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg7 = %arg5: tensor<1x64x12x20xbf16>, %arg8 = %arg6: tensor<1x64x12x20xbf16>) attributes { + LayerName = "Mul_354", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 64, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 64, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Mul_354", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 64, 12, 20]> : vector<4xindex> + } + ], + Specializes = "MulBf16", + Traits = { + Binary = true, + Elementwise = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.dtype = "bfloat16", + config.num_kernel_iters = 0 : ui16 + }} { + %462 = tosa.mul %arg7, %arg8 { + LayerName = "Mul_354", + OutputName = "Mul_354", + shift = 0 : i8} : (tensor<1x64x12x20xbf16>, tensor<1x64x12x20xbf16>) -> tensor<1x64x12x20xbf16> loc(#loc235) + xten_nn.output %462 : tensor<1x64x12x20xbf16> loc(#loc235) + } -> 
tensor<1x64x12x20xbf16> loc(#loc235) + xten_nn.output %461 : tensor<1x64x12x20xbf16> loc(#loc235) + } -> tensor<1x64x12x20xbf16> loc(#loc235) + %383 = xten_nn.subgraph (%arg5 = %374: tensor<1x64x12x20xbf16>, %arg6 = %382: tensor<1x64x12x20xbf16>) attributes { + Axis = 1 : i32, + IfmOperands = [0 : index, 1 : index], + LayerName = "Concat_355", + Op = "Concat", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 64, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 64, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Concat_355", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "PseudoOp", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 128, 12, 20]> : vector<4xindex> + } + ], + current_data_format = "NCHW", + data_format = "HCWN"} { + %461 = tosa.concat %arg5, %arg6 { + LayerName = "Concat_355", + OutputName = "Concat_355", + axis = 1 : i32} : (tensor<1x64x12x20xbf16>, tensor<1x64x12x20xbf16>) -> tensor<1x128x12x20xbf16> loc(#loc236) + xten_nn.output %461 : tensor<1x128x12x20xbf16> loc(#loc236) + } -> tensor<1x128x12x20xbf16> loc(#loc236) + %384 = xten_nn.subgraph (%arg5 = %383: tensor<1x128x12x20xbf16>, %arg6 = %29: tensor<64x128x3x3xbf16>, %arg7 = %28: tensor<64xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Conv_356", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 128, 12, 20]> : vector<4xindex> + }, + { + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[64, 128, 3, 3]> : vector<4xindex> + }, + { + UnknownDataFormat = true + } + ], + OutputName = "Conv_356", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 64, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg8 = %arg5: tensor<1x128x12x20xbf16>, %arg9 = %arg6: tensor<64x128x3x3xbf16>, %arg10 = %arg7: tensor<64xbf16>) attributes { + Dilations = array, + HWPadding = [[1, 1], [1, 1]], + LayerName = "Conv_356", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 128, 12, 20]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "wts_data", + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[64, 128, 3, 3]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Conv_356", + Reason = "MllibKernel", + Results = [ + { 
+ CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 64, 12, 20]> : vector<4xindex> + } + ], + Specializes = "Conv2DBf16", + Traits = { + AllowDMAOptimization = true + }, + With = { + config.AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 = 1 : ui8, + config.act = 0 : ui8, + config.act_type = "RELU", + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.conv_type = [0 : ui8, 12 : ui8, 64 : ui8], + config.dtype_ifm = "bfloat16", + config.dtype_ofm = "bfloat16", + config.dtype_wts = "bfloat16", + config.ksize.height = 3 : ui8, + config.ksize.width = 3 : ui8, + config.lrelu_alpha = 1.000000e+00 : bf16, + config.lrelu_alpha_kernel = 1.000000e+00 : bf16, + config.stride_h = 1 : ui8, + config.stride_w = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %463 = "tosa.const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %464 = tosa.transpose %arg9, %463 : (tensor<64x128x3x3xbf16>, tensor<4xi32>) -> tensor<64x3x3x128xbf16> loc(#loc237) + %465 = tosa.transpose %arg8, %463 : (tensor<1x128x12x20xbf16>, tensor<4xi32>) -> tensor<1x12x20x128xbf16> loc(#loc237) + %466 = tosa.conv2d %465, %464, %arg10 { + PartOfLayerName = "Conv_356", + PartOfOutputName = "Conv_356", + dilation = array<i64: 1, 1>, + pad = array<i64: 1, 1, 1, 1>, + stride = array<i64: 1, 1>} : (tensor<1x12x20x128xbf16>, tensor<64x3x3x128xbf16>, tensor<64xbf16>) -> tensor<1x12x20x64xbf16> loc(#loc237) + %467 = tosa.transpose %466, %462 : (tensor<1x12x20x64xbf16>, tensor<4xi32>) -> tensor<1x64x12x20xbf16> loc(#loc237) + xten_nn.output %467 : tensor<1x64x12x20xbf16> loc(#loc237) + } -> tensor<1x64x12x20xbf16> loc(#loc237) + xten_nn.output %461 : tensor<1x64x12x20xbf16> loc(#loc237) + } -> tensor<1x64x12x20xbf16> loc(#loc237) + %385 = xten_nn.subgraph (%arg5 = %384: tensor<1x64x12x20xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Tanh_357", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 64, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Tanh_357", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 64, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x64x12x20xbf16>) attributes { + LayerName = "Tanh_357", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 64, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Tanh_357", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 64, 12, 20]> : vector<4xindex> + } + ], + Specializes = "TanhTemplatedBf16", + Traits = { + Elementwise = true, + Unary = true + }, + With = { + config.ENABLE_FP16_AS_BF16 = 0 : ui8, + config.aie_arch = "aie2p", + 
config.compiler = "chess", + config.ifm_shift = 0 : si8, + config.num_kernel_iters = 0 : ui16, + config.ofm_shift = 0 : si8 + }} { + %462 = tosa.tanh %arg6 {LayerName = "Tanh_357", OutputName = "Tanh_357"} : (tensor<1x64x12x20xbf16>) -> tensor<1x64x12x20xbf16> loc(#loc238) + xten_nn.output %462 : tensor<1x64x12x20xbf16> loc(#loc238) + } -> tensor<1x64x12x20xbf16> loc(#loc238) + xten_nn.output %461 : tensor<1x64x12x20xbf16> loc(#loc238) + } -> tensor<1x64x12x20xbf16> loc(#loc238) + %386 = xten_nn.subgraph (%arg5 = %378: tensor<1x64x12x20xbf16>, %arg6 = %385: tensor<1x64x12x20xbf16>) attributes { + IfmOperands = [0 : index, 1 : index], + LayerName = "Mul_361", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 64, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 64, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Mul_361", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 64, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg7 = %arg5: tensor<1x64x12x20xbf16>, %arg8 = %arg6: tensor<1x64x12x20xbf16>) attributes { + LayerName = "Mul_361", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 64, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 64, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Mul_361", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 64, 12, 20]> : vector<4xindex> + } + ], + Specializes = "MulBf16", + Traits = { + Binary = true, + Elementwise = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.dtype = "bfloat16", + config.num_kernel_iters = 0 : ui16 + }} { + %462 = tosa.mul %arg7, %arg8 { + LayerName = "Mul_361", + OutputName = "Mul_361", + shift = 0 : i8} : (tensor<1x64x12x20xbf16>, tensor<1x64x12x20xbf16>) -> tensor<1x64x12x20xbf16> loc(#loc239) + xten_nn.output %462 : tensor<1x64x12x20xbf16> loc(#loc239) + } -> tensor<1x64x12x20xbf16> loc(#loc239) + xten_nn.output %461 : tensor<1x64x12x20xbf16> loc(#loc239) + } -> tensor<1x64x12x20xbf16> loc(#loc239) + %387 = xten_nn.subgraph (%arg5 = %380: tensor<1x64x12x20xbf16>, %arg6 = %386: tensor<1x64x12x20xbf16>) attributes { + IfmOperands = [0 : index, 1 : index], + LayerName = "Add_362", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 64, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + 
L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 64, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Add_362", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 64, 12, 20]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg7 = %arg5: tensor<1x64x12x20xbf16>, %arg8 = %arg6: tensor<1x64x12x20xbf16>) attributes { + LayerName = "Add_362", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 64, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 64, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Add_362", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 64, 12, 20]> : vector<4xindex> + } + ], + Specializes = "AddBf16", + Traits = { + Binary = true, + Elementwise = true + }, + With = { + config.act = 0 : ui8, + config.act_type = "LINEAR", + config.aie_arch = "aie2p", + config.compiler = "chess", + config.dtype = "bfloat16", + config.num_kernel_iters = 0 : ui16 + }} { + %462 = tosa.add %arg7, %arg8 {LayerName = "Add_362", OutputName = "Add_362"} : (tensor<1x64x12x20xbf16>, tensor<1x64x12x20xbf16>) -> tensor<1x64x12x20xbf16> loc(#loc241) + xten_nn.output %462 : tensor<1x64x12x20xbf16> loc(#loc241) + } -> tensor<1x64x12x20xbf16> loc(#loc241) + xten_nn.output %461 : tensor<1x64x12x20xbf16> loc(#loc241) + } -> tensor<1x64x12x20xbf16> loc(#loc241) + %388 = xten_nn.subgraph (%arg5 = %373: tensor<1x64x12x20xbf16>, %arg6 = %387: tensor<1x64x12x20xbf16>) attributes { + Axis = 1 : i32, + IfmOperands = [0 : index, 1 : index], + LayerName = "Concat_363", + Op = "Concat", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 64, 12, 20]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 64, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Concat_363", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "PseudoOp", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 128, 12, 20]> : vector<4xindex> + } + ], + current_data_format = "NCHW", + data_format = "HCWN"} { + %461 = tosa.concat %arg5, %arg6 { + LayerName = "Concat_363", + OutputName = "Concat_363", + axis = 1 : i32} : 
(tensor<1x64x12x20xbf16>, tensor<1x64x12x20xbf16>) -> tensor<1x128x12x20xbf16> loc(#loc242) + xten_nn.output %461 : tensor<1x128x12x20xbf16> loc(#loc242) + } -> tensor<1x128x12x20xbf16> loc(#loc242) + %389 = xten_nn.subgraph (%arg5 = %388: tensor<1x128x12x20xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Resize_365", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 128, 12, 20]> : vector<4xindex> + } + ], + OutputName = "Resize_365", + Overlay = "1x1_1x1_unspecifiedConnectivity", + Reason = "TemplatedGraph", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 128, 24, 40]> : vector<4xindex> + } + ], + Specializes = "ResizeAdf", + With = { + config.co_trans_mode = 1 : ui32, + config.dim_0 = 1 : ui32, + config.dim_1 = 128 : ui32, + config.dim_2 = 12 : ui32, + config.dim_3 = 20 : ui32, + config.dtype = "bfloat16", + config.mode = 1 : ui32, + config.nearest_mode = 0 : ui32, + config.num_ifm_shim_ch = 2 : ui32, + config.num_ofm_shim_ch = 2 : ui32, + config.output_H = 24 : ui32, + config.output_W = 40 : ui32 + }} { + %461 = xten_nn.resize %arg5 { + LayerName = "Resize_365", + OutputName = "Resize_365", + coordinate_transformation_mode = 1 : i64, + mode = 1 : i64, + nearest_mode = 0 : i64, + scales = array<f32: 1.000000e+00, 1.000000e+00, 2.000000e+00, 2.000000e+00>} : (tensor<1x128x12x20xbf16>) -> tensor<1x128x24x40xbf16> loc(#loc243) + xten_nn.output %461 : tensor<1x128x24x40xbf16> loc(#loc243) + } -> tensor<1x128x24x40xbf16> loc(#loc243) + %390 = xten_nn.subgraph (%arg5 = %389: tensor<1x128x24x40xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Slice_371", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 128, 24, 40]> : vector<4xindex> + } + ], + OutputName = "Slice_371", + Overlay = "1x1_1x1_unspecifiedConnectivity", + Reason = "TemplatedGraph", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 128, 23, 40]> : vector<4xindex> + } + ], + Specializes = "SliceHCWC8Adf", + With = { + config.aie_arch = "aie2p", + config.axis_letter = "H", + config.dim_c = 128 : ui32, + config.dim_h = 24 : ui32, + config.dim_w = 40 : ui32, + config.dtype = "bfloat16", + config.end = 23 : ui32, + config.num_ifm_shim_ch = 2 : ui32, + config.num_ofm_shim_ch = 2 : ui32, + config.start = 0 : ui32, + config.step = 1 : ui32 + }} { + %461 = tosa.slice %arg5 { + LayerName = "Slice_371", + OutputName = "Slice_371", + size = array<i64: 1, 128, 23, 40>, + start = array<i64: 0, 0, 0, 0>} : (tensor<1x128x24x40xbf16>) -> tensor<1x128x23x40xbf16> loc(#loc244) + xten_nn.output %461 : tensor<1x128x23x40xbf16> loc(#loc244) + } -> tensor<1x128x23x40xbf16> loc(#loc244) + %391 = xten_nn.subgraph (%arg5 = %166: tensor<1x3x180x320xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "AveragePool_346", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<[0, 5, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 3, 180, 320]> 
: vector<4xindex> + } + ], + OutputName = "AveragePool_346", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<[0, 5, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 3, 90, 160]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "double", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x3x180x320xbf16>) attributes { + Dilations = array<i64: 1, 1>, + HWPadding = [[0, 0], [0, 0]], + HWPaddingNotCounted = [[0, 0], [0, 0]], + LayerName = "AveragePool_346", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<[0, 5, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 3, 180, 320]> : vector<4xindex> + } + ], + OutputName = "AveragePool_346", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<[0, 5, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 3, 90, 160]> : vector<4xindex> + } + ], + Specializes = "AvgPool2dBf16", + With = { + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.dtype = "bfloat16", + config.ksize = 2 : ui8, + config.stride_log2 = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %463 = "tosa.const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc12) + %464 = tosa.transpose %arg6, %463 : (tensor<1x3x180x320xbf16>, tensor<4xi32>) -> tensor<1x180x320x3xbf16> loc(#loc12) + %465 = tosa.avg_pool2d %464 { + PartOfLayerName = "AveragePool_346", + PartOfOutputName = "AveragePool_346", + acc_type = f32, + kernel = array<i64: 2, 2>, + pad = array<i64: 0, 0, 0, 0>, + stride = array<i64: 2, 2>} : (tensor<1x180x320x3xbf16>) -> tensor<1x90x160x3xbf16> loc(#loc12) + %466 = tosa.transpose %465, %462 : (tensor<1x90x160x3xbf16>, tensor<4xi32>) -> tensor<1x3x90x160xbf16> loc(#loc12) + xten_nn.output %466 : tensor<1x3x90x160xbf16> loc(#loc12) + } -> tensor<1x3x90x160xbf16> loc(#loc12) + xten_nn.output %461 : tensor<1x3x90x160xbf16> loc(#loc12) + } -> tensor<1x3x90x160xbf16> loc(#loc12) + %392 = xten_nn.subgraph (%arg5 = %391: tensor<1x3x90x160xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "AveragePool_347", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<[0, 5, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 3, 90, 160]> : vector<4xindex> + } + ], + OutputName = "AveragePool_347", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<[0, 5, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 3, 45, 80]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "double", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x3x90x160xbf16>) attributes { + Dilations = array<i64: 1, 1>, + HWPadding = [[0, 0], [0, 0]], + HWPaddingNotCounted = [[0, 0], [0, 0]], + LayerName = "AveragePool_347", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = 
"data_io.ifm", + l3_extend_end = dense<[0, 5, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 3, 90, 160]> : vector<4xindex> + } + ], + OutputName = "AveragePool_347", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<[0, 5, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 3, 45, 80]> : vector<4xindex> + } + ], + Specializes = "AvgPool2dBf16", + With = { + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.dtype = "bfloat16", + config.ksize = 2 : ui8, + config.stride_log2 = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %463 = "tosa.const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc245) + %464 = tosa.transpose %arg6, %463 : (tensor<1x3x90x160xbf16>, tensor<4xi32>) -> tensor<1x90x160x3xbf16> loc(#loc245) + %465 = tosa.avg_pool2d %464 { + PartOfLayerName = "AveragePool_347", + PartOfOutputName = "AveragePool_347", + acc_type = f32, + kernel = array, + pad = array, + stride = array} : (tensor<1x90x160x3xbf16>) -> tensor<1x45x80x3xbf16> loc(#loc245) + %466 = tosa.transpose %465, %462 : (tensor<1x45x80x3xbf16>, tensor<4xi32>) -> tensor<1x3x45x80xbf16> loc(#loc245) + xten_nn.output %466 : tensor<1x3x45x80xbf16> loc(#loc245) + } -> tensor<1x3x45x80xbf16> loc(#loc245) + xten_nn.output %461 : tensor<1x3x45x80xbf16> loc(#loc245) + } -> tensor<1x3x45x80xbf16> loc(#loc245) + %393 = xten_nn.subgraph (%arg5 = %392: tensor<1x3x45x80xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "AveragePool_348", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<[0, 5, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 3, 45, 80]> : vector<4xindex> + } + ], + OutputName = "AveragePool_348", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<[0, 5, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 3, 23, 40]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "double", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x3x45x80xbf16>) attributes { + Dilations = array, + HWPadding = [[0, 1], [0, 0]], + HWPaddingNotCounted = [[0, 1], [0, 0]], + LayerName = "AveragePool_348", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<[0, 5, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 3, 45, 80]> : vector<4xindex> + } + ], + OutputName = "AveragePool_348", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<[0, 5, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 3, 23, 40]> : vector<4xindex> + } + ], + Specializes = "AvgPool2dBf16", + With = { + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.dtype = "bfloat16", + config.ksize = 2 : ui8, + config.stride_log2 = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %463 = "tosa.const"() <{value = dense<[0, 2, 3, 
1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc246) + %464 = tosa.transpose %arg6, %463 : (tensor<1x3x45x80xbf16>, tensor<4xi32>) -> tensor<1x45x80x3xbf16> loc(#loc246) + %465 = tosa.avg_pool2d %464 { + PartOfLayerName = "AveragePool_348", + PartOfOutputName = "AveragePool_348", + acc_type = f32, + kernel = array<i64: 2, 2>, + pad = array<i64: 0, 1, 0, 0>, + stride = array<i64: 2, 2>} : (tensor<1x45x80x3xbf16>) -> tensor<1x23x40x3xbf16> loc(#loc246) + %466 = tosa.transpose %465, %462 : (tensor<1x23x40x3xbf16>, tensor<4xi32>) -> tensor<1x3x23x40xbf16> loc(#loc246) + xten_nn.output %466 : tensor<1x3x23x40xbf16> loc(#loc246) + } -> tensor<1x3x23x40xbf16> loc(#loc246) + xten_nn.output %461 : tensor<1x3x23x40xbf16> loc(#loc246) + } -> tensor<1x3x23x40xbf16> loc(#loc246) + %394 = xten_nn.subgraph (%arg5 = %390: tensor<1x128x23x40xbf16>, %arg6 = %217: tensor<1x40x23x40xbf16>, %arg7 = %393: tensor<1x3x23x40xbf16>) attributes { + Axis = 1 : i32, + IfmOperands = [0 : index, 1 : index, 2 : index], + LayerName = "Concat_372", + Op = "Concat", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 128, 23, 40]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 23, 40]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm3", + l3_extend_end = dense<[0, 5, 0, 0]> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 3, 23, 40]> : vector<4xindex> + } + ], + OutputName = "Concat_372", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "PseudoOp", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<[0, 5, 0, 0]> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 171, 23, 40]> : vector<4xindex> + } + ], + current_data_format = "NCHW", + data_format = "HCWN"} { + %461 = tosa.concat %arg5, %arg6, %arg7 { + LayerName = "Concat_372", + OutputName = "Concat_372", + axis = 1 : i32} : (tensor<1x128x23x40xbf16>, tensor<1x40x23x40xbf16>, tensor<1x3x23x40xbf16>) -> tensor<1x171x23x40xbf16> loc(#loc247) + xten_nn.output %461 : tensor<1x171x23x40xbf16> loc(#loc247) + } -> tensor<1x171x23x40xbf16> loc(#loc247) + %395 = xten_nn.subgraph (%arg5 = %394: tensor<1x171x23x40xbf16>, %arg6 = %26: tensor<80x171x3x3xbf16>, %arg7 = %25: tensor<80xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Conv_373", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<[0, 5, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 171, 23, 40]> : vector<4xindex> + }, + { + UnknownDataFormat = true, + l3_extend_end = dense<[0, 5, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[80, 171, 3, 3]> : vector<4xindex> + }, + { + UnknownDataFormat = true + } + ], + OutputName = "Relu_374", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + 
l3_tile_count = dense<[1, 80, 23, 40]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg8 = %arg5: tensor<1x171x23x40xbf16>, %arg9 = %arg6: tensor<80x171x3x3xbf16>, %arg10 = %arg7: tensor<80xbf16>) attributes { + Dilations = array<i64: 1, 1>, + HWPadding = [[1, 1], [1, 1]], + LayerName = "Conv_373", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<[0, 5, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 171, 23, 40]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "wts_data", + UnknownDataFormat = true, + l3_extend_end = dense<[0, 5, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[80, 171, 3, 3]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Relu_374", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 80, 23, 40]> : vector<4xindex> + } + ], + Specializes = "Conv2DBf16", + Traits = { + AllowDMAOptimization = true, + NonNegativeOut = true + }, + With = { + config.AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 = 1 : ui8, + config.act = 1 : ui8, + config.act_type = "RELU", + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.conv_type = [0 : ui8, 12 : ui8, 64 : ui8], + config.dtype_ifm = "bfloat16", + config.dtype_ofm = "bfloat16", + config.dtype_wts = "bfloat16", + config.ksize.height = 3 : ui8, + config.ksize.width = 3 : ui8, + config.lrelu_alpha = 0.000000e+00 : bf16, + config.lrelu_alpha_kernel = 0.000000e+00 : bf16, + config.stride_h = 1 : ui8, + config.stride_w = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %463 = "tosa.const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %464 = tosa.transpose %arg9, %463 : (tensor<80x171x3x3xbf16>, tensor<4xi32>) -> tensor<80x3x3x171xbf16> loc(#loc349) + %465 = tosa.transpose %arg8, %463 : (tensor<1x171x23x40xbf16>, tensor<4xi32>) -> tensor<1x23x40x171xbf16> loc(#loc349) + %466 = tosa.conv2d %465, %464, %arg10 { + PartOfLayerName = "Conv_373", + PartOfOutputName = "Conv_373", + dilation = array<i64: 1, 1>, + pad = array<i64: 1, 1, 1, 1>, + stride = array<i64: 1, 1>} : (tensor<1x23x40x171xbf16>, tensor<80x3x3x171xbf16>, tensor<80xbf16>) -> tensor<1x23x40x80xbf16> loc(#loc248) + %467 = tosa.clamp %466 { + LayerName = "Relu_374", + OutputName = "Relu_374", + max_fp = 3.40282347E+38 : f32, + max_int = 2147483647 : i64, + min_fp = 0.000000e+00 : f32, + min_int = 0 : i64} : (tensor<1x23x40x80xbf16>) -> tensor<1x23x40x80xbf16> loc(#loc249) + %468 = tosa.transpose %467, %462 : (tensor<1x23x40x80xbf16>, tensor<4xi32>) -> tensor<1x80x23x40xbf16> loc(#loc349) + xten_nn.output %468 : tensor<1x80x23x40xbf16> loc(#loc249) + } -> tensor<1x80x23x40xbf16> loc(#loc349) + xten_nn.output %461 : tensor<1x80x23x40xbf16> loc(#loc349) + } -> tensor<1x80x23x40xbf16> loc(#loc349) + %396 = xten_nn.subgraph (%arg5 = %395: tensor<1x80x23x40xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Split_375_Duplicated#0", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + 
l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 80, 23, 40]> : vector<4xindex> + } + ], + OutputName = "Split_375_Duplicated#0", + Overlay = "1x1_1x1_unspecifiedConnectivity", + Reason = "TemplatedGraph", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 23, 40]> : vector<4xindex> + } + ], + Specializes = "SliceHCWC8Adf", + With = { + config.aie_arch = "aie2p", + config.axis_letter = "C", + config.dim_c = 80 : ui32, + config.dim_h = 23 : ui32, + config.dim_w = 40 : ui32, + config.dtype = "bfloat16", + config.end = 40 : ui32, + config.num_ifm_shim_ch = 2 : ui32, + config.num_ofm_shim_ch = 2 : ui32, + config.start = 0 : ui32, + config.step = 1 : ui32 + }} { + %461 = tosa.slice %arg5 { + PartOfLayerName = "Split_375", + PartOfOutputName = "Split_375", + size = array<i64: 1, 40, 23, 40>, + start = array<i64: 0, 0, 0, 0>} : (tensor<1x80x23x40xbf16>) -> tensor<1x40x23x40xbf16> loc(#loc250) + xten_nn.output %461 : tensor<1x40x23x40xbf16> loc(#loc250) + } -> tensor<1x40x23x40xbf16> loc(#loc250) + %397 = xten_nn.subgraph (%arg5 = %395: tensor<1x80x23x40xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Split_375_Duplicated#1", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 80, 23, 40]> : vector<4xindex> + } + ], + OutputName = "Split_375_Duplicated#1", + Overlay = "1x1_1x1_unspecifiedConnectivity", + Reason = "TemplatedGraph", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 23, 40]> : vector<4xindex> + } + ], + Specializes = "SliceHCWC8Adf", + With = { + config.aie_arch = "aie2p", + config.axis_letter = "C", + config.dim_c = 80 : ui32, + config.dim_h = 23 : ui32, + config.dim_w = 40 : ui32, + config.dtype = "bfloat16", + config.end = 80 : ui32, + config.num_ifm_shim_ch = 2 : ui32, + config.num_ofm_shim_ch = 2 : ui32, + config.start = 40 : ui32, + config.step = 1 : ui32 + }} { + %461 = tosa.slice %arg5 { + PartOfLayerName = "Split_375", + PartOfOutputName = "Split_375", + size = array<i64: 1, 40, 23, 40>, + start = array<i64: 0, 40, 0, 0>} : (tensor<1x80x23x40xbf16>) -> tensor<1x40x23x40xbf16> loc(#loc250) + xten_nn.output %461 : tensor<1x40x23x40xbf16> loc(#loc250) + } -> tensor<1x40x23x40xbf16> loc(#loc250) + %398 = xten_nn.subgraph (%arg5 = %397: tensor<1x40x23x40xbf16>, %arg6 = %arg3: tensor<1x40x23x40xbf16>) attributes { + Axis = 1 : i32, + IfmOperands = [0 : index, 1 : index], + LayerName = "Concat_376", + Op = "Concat", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 23, 40]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 23, 40]> : vector<4xindex> + } + ], + 
OutputName = "Concat_376", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "PseudoOp", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 80, 23, 40]> : vector<4xindex> + } + ], + current_data_format = "NCHW", + data_format = "HCWN"} { + %461 = tosa.concat %arg5, %arg6 { + LayerName = "Concat_376", + OutputName = "Concat_376", + axis = 1 : i32} : (tensor<1x40x23x40xbf16>, tensor<1x40x23x40xbf16>) -> tensor<1x80x23x40xbf16> loc(#loc251) + xten_nn.output %461 : tensor<1x80x23x40xbf16> loc(#loc251) + } -> tensor<1x80x23x40xbf16> loc(#loc251) + %399 = xten_nn.subgraph (%arg5 = %398: tensor<1x80x23x40xbf16>, %arg6 = %24: tensor<80x80x3x3xbf16>, %arg7 = %23: tensor<80xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Conv_377", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 80, 23, 40]> : vector<4xindex> + }, + { + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[80, 80, 3, 3]> : vector<4xindex> + }, + { + UnknownDataFormat = true + } + ], + OutputName = "Conv_377", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 80, 23, 40]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg8 = %arg5: tensor<1x80x23x40xbf16>, %arg9 = %arg6: tensor<80x80x3x3xbf16>, %arg10 = %arg7: tensor<80xbf16>) attributes { + Dilations = array, + HWPadding = [[1, 1], [1, 1]], + LayerName = "Conv_377", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 80, 23, 40]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "wts_data", + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[80, 80, 3, 3]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Conv_377", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 80, 23, 40]> : vector<4xindex> + } + ], + Specializes = "Conv2DBf16", + Traits = { + AllowDMAOptimization = true + }, + With = { + config.AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 = 1 : ui8, + config.act = 0 : ui8, + config.act_type = "RELU", + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.conv_type = [0 : ui8, 12 : ui8, 64 : ui8], + config.dtype_ifm = "bfloat16", + config.dtype_ofm = "bfloat16", + config.dtype_wts = "bfloat16", + config.ksize.height = 3 : ui8, + config.ksize.width = 3 : ui8, + config.lrelu_alpha = 1.000000e+00 : bf16, + config.lrelu_alpha_kernel = 1.000000e+00 : bf16, + config.stride_h = 1 : ui8, + config.stride_w = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<[0, 3, 
1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %463 = "tosa.const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %464 = tosa.transpose %arg9, %463 : (tensor<80x80x3x3xbf16>, tensor<4xi32>) -> tensor<80x3x3x80xbf16> loc(#loc252) + %465 = tosa.transpose %arg8, %463 : (tensor<1x80x23x40xbf16>, tensor<4xi32>) -> tensor<1x23x40x80xbf16> loc(#loc252) + %466 = tosa.conv2d %465, %464, %arg10 { + PartOfLayerName = "Conv_377", + PartOfOutputName = "Conv_377", + dilation = array, + pad = array, + stride = array} : (tensor<1x23x40x80xbf16>, tensor<80x3x3x80xbf16>, tensor<80xbf16>) -> tensor<1x23x40x80xbf16> loc(#loc252) + %467 = tosa.transpose %466, %462 : (tensor<1x23x40x80xbf16>, tensor<4xi32>) -> tensor<1x80x23x40xbf16> loc(#loc252) + xten_nn.output %467 : tensor<1x80x23x40xbf16> loc(#loc252) + } -> tensor<1x80x23x40xbf16> loc(#loc252) + xten_nn.output %461 : tensor<1x80x23x40xbf16> loc(#loc252) + } -> tensor<1x80x23x40xbf16> loc(#loc252) + %400 = xten_nn.subgraph (%arg5 = %399: tensor<1x80x23x40xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Sigmoid_378", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 80, 23, 40]> : vector<4xindex> + } + ], + OutputName = "Sigmoid_378", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 80, 23, 40]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x80x23x40xbf16>) attributes { + LayerName = "Sigmoid_378", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 80, 23, 40]> : vector<4xindex> + } + ], + OutputName = "Sigmoid_378", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 80, 23, 40]> : vector<4xindex> + } + ], + Specializes = "SigmoidTemplatedBf16", + Traits = { + Elementwise = true, + NonNegativeOut = true, + Unary = true + }, + With = { + config.ENABLE_FP16_AS_BF16 = 0 : ui8, + config.aie_arch = "aie2p", + config.compiler = "chess", + config.ifm_shift = 0 : si8, + config.num_kernel_iters = 0 : ui16, + config.ofm_shift = 0 : si8 + }} { + %462 = tosa.sigmoid %arg6 {LayerName = "Sigmoid_378", OutputName = "Sigmoid_378"} : (tensor<1x80x23x40xbf16>) -> tensor<1x80x23x40xbf16> loc(#loc253) + xten_nn.output %462 : tensor<1x80x23x40xbf16> loc(#loc253) + } -> tensor<1x80x23x40xbf16> loc(#loc253) + xten_nn.output %461 : tensor<1x80x23x40xbf16> loc(#loc253) + } -> tensor<1x80x23x40xbf16> loc(#loc253) + %401 = xten_nn.subgraph (%arg5 = %400: tensor<1x80x23x40xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Split_379_Duplicated#1", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 
80, 23, 40]> : vector<4xindex> + } + ], + OutputName = "Split_379_Duplicated#1", + Overlay = "1x1_1x1_unspecifiedConnectivity", + Reason = "TemplatedGraph", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 23, 40]> : vector<4xindex> + } + ], + Specializes = "SliceHCWC8Adf", + With = { + config.aie_arch = "aie2p", + config.axis_letter = "C", + config.dim_c = 80 : ui32, + config.dim_h = 23 : ui32, + config.dim_w = 40 : ui32, + config.dtype = "bfloat16", + config.end = 80 : ui32, + config.num_ifm_shim_ch = 2 : ui32, + config.num_ofm_shim_ch = 2 : ui32, + config.start = 40 : ui32, + config.step = 1 : ui32 + }} { + %461 = tosa.slice %arg5 { + PartOfLayerName = "Split_379", + PartOfOutputName = "Split_379", + size = array, + start = array} : (tensor<1x80x23x40xbf16>) -> tensor<1x40x23x40xbf16> loc(#loc254) + xten_nn.output %461 : tensor<1x40x23x40xbf16> loc(#loc254) + } -> tensor<1x40x23x40xbf16> loc(#loc254) + %402 = xten_nn.subgraph (%arg5 = %20: tensor<1x40x23x40xbf16>, %arg6 = %401: tensor<1x40x23x40xbf16>) attributes { + IfmOperands = [1 : index], + LayerName = "Sub_385", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 23, 40]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 23, 40]> : vector<4xindex> + } + ], + OutputName = "Sub_385", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 23, 40]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg7 = %arg5: tensor<1x40x23x40xbf16>, %arg8 = %arg6: tensor<1x40x23x40xbf16>) attributes { + LayerName = "Sub_385", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 23, 40]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 23, 40]> : vector<4xindex> + } + ], + OutputName = "Sub_385", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 23, 40]> : vector<4xindex> + } + ], + Specializes = "SubBf16", + Traits = { + Binary = true, + Elementwise = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.dtype = "bfloat16", + config.num_kernel_iters = 0 : ui16 + }} { + %462 = tosa.sub %arg7, %arg8 {LayerName = "Sub_385", OutputName = "Sub_385"} : (tensor<1x40x23x40xbf16>, tensor<1x40x23x40xbf16>) -> tensor<1x40x23x40xbf16> loc(#loc4) + xten_nn.output %462 : tensor<1x40x23x40xbf16> loc(#loc4) + } -> tensor<1x40x23x40xbf16> loc(#loc4) + 
xten_nn.output %461 : tensor<1x40x23x40xbf16> loc(#loc4) + } -> tensor<1x40x23x40xbf16> loc(#loc4) + %403 = xten_nn.subgraph (%arg5 = %402: tensor<1x40x23x40xbf16>, %arg6 = %arg3: tensor<1x40x23x40xbf16>) attributes { + IfmOperands = [0 : index, 1 : index], + LayerName = "Mul_386", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 23, 40]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 23, 40]> : vector<4xindex> + } + ], + OutputName = "Mul_386", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 23, 40]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg7 = %arg5: tensor<1x40x23x40xbf16>, %arg8 = %arg6: tensor<1x40x23x40xbf16>) attributes { + LayerName = "Mul_386", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 23, 40]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 23, 40]> : vector<4xindex> + } + ], + OutputName = "Mul_386", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 23, 40]> : vector<4xindex> + } + ], + Specializes = "MulBf16", + Traits = { + Binary = true, + Elementwise = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.dtype = "bfloat16", + config.num_kernel_iters = 0 : ui16 + }} { + %462 = tosa.mul %arg7, %arg8 { + LayerName = "Mul_386", + OutputName = "Mul_386", + shift = 0 : i8} : (tensor<1x40x23x40xbf16>, tensor<1x40x23x40xbf16>) -> tensor<1x40x23x40xbf16> loc(#loc260) + xten_nn.output %462 : tensor<1x40x23x40xbf16> loc(#loc260) + } -> tensor<1x40x23x40xbf16> loc(#loc260) + xten_nn.output %461 : tensor<1x40x23x40xbf16> loc(#loc260) + } -> tensor<1x40x23x40xbf16> loc(#loc260) + %404 = xten_nn.subgraph (%arg5 = %400: tensor<1x80x23x40xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Split_379_Duplicated#0", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 80, 23, 40]> : vector<4xindex> + } + ], + OutputName = "Split_379_Duplicated#0", + Overlay = "1x1_1x1_unspecifiedConnectivity", + Reason = "TemplatedGraph", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 23, 40]> : 
vector<4xindex> + } + ], + Specializes = "SliceHCWC8Adf", + With = { + config.aie_arch = "aie2p", + config.axis_letter = "C", + config.dim_c = 80 : ui32, + config.dim_h = 23 : ui32, + config.dim_w = 40 : ui32, + config.dtype = "bfloat16", + config.end = 40 : ui32, + config.num_ifm_shim_ch = 2 : ui32, + config.num_ofm_shim_ch = 2 : ui32, + config.start = 0 : ui32, + config.step = 1 : ui32 + }} { + %461 = tosa.slice %arg5 { + PartOfLayerName = "Split_379", + PartOfOutputName = "Split_379", + size = array<i64: 1, 40, 23, 40>, + start = array<i64: 0, 0, 0, 0>} : (tensor<1x80x23x40xbf16>) -> tensor<1x40x23x40xbf16> loc(#loc254) + xten_nn.output %461 : tensor<1x40x23x40xbf16> loc(#loc254) + } -> tensor<1x40x23x40xbf16> loc(#loc254) + %405 = xten_nn.subgraph (%arg5 = %404: tensor<1x40x23x40xbf16>, %arg6 = %arg3: tensor<1x40x23x40xbf16>) attributes { + IfmOperands = [0 : index, 1 : index], + LayerName = "Mul_380", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 23, 40]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 23, 40]> : vector<4xindex> + } + ], + OutputName = "Mul_380", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 23, 40]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg7 = %arg5: tensor<1x40x23x40xbf16>, %arg8 = %arg6: tensor<1x40x23x40xbf16>) attributes { + LayerName = "Mul_380", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 23, 40]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 23, 40]> : vector<4xindex> + } + ], + OutputName = "Mul_380", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 23, 40]> : vector<4xindex> + } + ], + Specializes = "MulBf16", + Traits = { + Binary = true, + Elementwise = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.dtype = "bfloat16", + config.num_kernel_iters = 0 : ui16 + }} { + %462 = tosa.mul %arg7, %arg8 { + LayerName = "Mul_380", + OutputName = "Mul_380", + shift = 0 : i8} : (tensor<1x40x23x40xbf16>, tensor<1x40x23x40xbf16>) -> tensor<1x40x23x40xbf16> loc(#loc255) + xten_nn.output %462 : tensor<1x40x23x40xbf16> loc(#loc255) + } -> tensor<1x40x23x40xbf16> loc(#loc255) + xten_nn.output %461 : tensor<1x40x23x40xbf16> loc(#loc255) + } -> tensor<1x40x23x40xbf16> loc(#loc255) + %406 = xten_nn.subgraph (%arg5 = %397: tensor<1x40x23x40xbf16>, %arg6 = %405: tensor<1x40x23x40xbf16>) attributes { + Axis = 1 : i32, + IfmOperands = [0 : index, 1 : index], + LayerName = "Concat_381", + Op = "Concat", + Operands = [ + {
CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 23, 40]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 23, 40]> : vector<4xindex> + } + ], + OutputName = "Concat_381", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "PseudoOp", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 80, 23, 40]> : vector<4xindex> + } + ], + current_data_format = "NCHW", + data_format = "HCWN"} { + %461 = tosa.concat %arg5, %arg6 { + LayerName = "Concat_381", + OutputName = "Concat_381", + axis = 1 : i32} : (tensor<1x40x23x40xbf16>, tensor<1x40x23x40xbf16>) -> tensor<1x80x23x40xbf16> loc(#loc256) + xten_nn.output %461 : tensor<1x80x23x40xbf16> loc(#loc256) + } -> tensor<1x80x23x40xbf16> loc(#loc256) + %407 = xten_nn.subgraph (%arg5 = %406: tensor<1x80x23x40xbf16>, %arg6 = %22: tensor<40x80x3x3xbf16>, %arg7 = %21: tensor<40xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Conv_382", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 80, 23, 40]> : vector<4xindex> + }, + { + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[40, 80, 3, 3]> : vector<4xindex> + }, + { + UnknownDataFormat = true + } + ], + OutputName = "Conv_382", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 23, 40]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg8 = %arg5: tensor<1x80x23x40xbf16>, %arg9 = %arg6: tensor<40x80x3x3xbf16>, %arg10 = %arg7: tensor<40xbf16>) attributes { + Dilations = array<i64: 1, 1>, + HWPadding = [[1, 1], [1, 1]], + LayerName = "Conv_382", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 80, 23, 40]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "wts_data", + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[40, 80, 3, 3]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Conv_382", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 23, 40]> : vector<4xindex> + } + ], + Specializes = "Conv2DBf16", + Traits = { + AllowDMAOptimization = true + }, + With = { + config.AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 = 1 : ui8, + config.act = 0 : ui8, +
config.act_type = "RELU", + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.conv_type = [0 : ui8, 12 : ui8, 64 : ui8], + config.dtype_ifm = "bfloat16", + config.dtype_ofm = "bfloat16", + config.dtype_wts = "bfloat16", + config.ksize.height = 3 : ui8, + config.ksize.width = 3 : ui8, + config.lrelu_alpha = 1.000000e+00 : bf16, + config.lrelu_alpha_kernel = 1.000000e+00 : bf16, + config.stride_h = 1 : ui8, + config.stride_w = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %463 = "tosa.const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %464 = tosa.transpose %arg9, %463 : (tensor<40x80x3x3xbf16>, tensor<4xi32>) -> tensor<40x3x3x80xbf16> loc(#loc257) + %465 = tosa.transpose %arg8, %463 : (tensor<1x80x23x40xbf16>, tensor<4xi32>) -> tensor<1x23x40x80xbf16> loc(#loc257) + %466 = tosa.conv2d %465, %464, %arg10 { + PartOfLayerName = "Conv_382", + PartOfOutputName = "Conv_382", + dilation = array<i64: 1, 1>, + pad = array<i64: 1, 1, 1, 1>, + stride = array<i64: 1, 1>} : (tensor<1x23x40x80xbf16>, tensor<40x3x3x80xbf16>, tensor<40xbf16>) -> tensor<1x23x40x40xbf16> loc(#loc257) + %467 = tosa.transpose %466, %462 : (tensor<1x23x40x40xbf16>, tensor<4xi32>) -> tensor<1x40x23x40xbf16> loc(#loc257) + xten_nn.output %467 : tensor<1x40x23x40xbf16> loc(#loc257) + } -> tensor<1x40x23x40xbf16> loc(#loc257) + xten_nn.output %461 : tensor<1x40x23x40xbf16> loc(#loc257) + } -> tensor<1x40x23x40xbf16> loc(#loc257) + %408 = xten_nn.subgraph (%arg5 = %407: tensor<1x40x23x40xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Tanh_383", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 23, 40]> : vector<4xindex> + } + ], + OutputName = "Tanh_383", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 23, 40]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x40x23x40xbf16>) attributes { + LayerName = "Tanh_383", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 23, 40]> : vector<4xindex> + } + ], + OutputName = "Tanh_383", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 23, 40]> : vector<4xindex> + } + ], + Specializes = "TanhTemplatedBf16", + Traits = { + Elementwise = true, + Unary = true + }, + With = { + config.ENABLE_FP16_AS_BF16 = 0 : ui8, + config.aie_arch = "aie2p", + config.compiler = "chess", + config.ifm_shift = 0 : si8, + config.num_kernel_iters = 0 : ui16, + config.ofm_shift = 0 : si8 + }} { + %462 = tosa.tanh %arg6 {LayerName = "Tanh_383", OutputName = "Tanh_383"} : (tensor<1x40x23x40xbf16>) -> tensor<1x40x23x40xbf16> loc(#loc258) + xten_nn.output %462 : tensor<1x40x23x40xbf16> loc(#loc258) + } -> tensor<1x40x23x40xbf16> loc(#loc258) + xten_nn.output %461 :
tensor<1x40x23x40xbf16> loc(#loc258) + } -> tensor<1x40x23x40xbf16> loc(#loc258) + %409 = xten_nn.subgraph (%arg5 = %401: tensor<1x40x23x40xbf16>, %arg6 = %408: tensor<1x40x23x40xbf16>) attributes { + IfmOperands = [0 : index, 1 : index], + LayerName = "Mul_387", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 23, 40]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 23, 40]> : vector<4xindex> + } + ], + OutputName = "Mul_387", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 23, 40]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg7 = %arg5: tensor<1x40x23x40xbf16>, %arg8 = %arg6: tensor<1x40x23x40xbf16>) attributes { + LayerName = "Mul_387", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 23, 40]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 23, 40]> : vector<4xindex> + } + ], + OutputName = "Mul_387", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 23, 40]> : vector<4xindex> + } + ], + Specializes = "MulBf16", + Traits = { + Binary = true, + Elementwise = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.dtype = "bfloat16", + config.num_kernel_iters = 0 : ui16 + }} { + %462 = tosa.mul %arg7, %arg8 { + LayerName = "Mul_387", + OutputName = "Mul_387", + shift = 0 : i8} : (tensor<1x40x23x40xbf16>, tensor<1x40x23x40xbf16>) -> tensor<1x40x23x40xbf16> loc(#loc259) + xten_nn.output %462 : tensor<1x40x23x40xbf16> loc(#loc259) + } -> tensor<1x40x23x40xbf16> loc(#loc259) + xten_nn.output %461 : tensor<1x40x23x40xbf16> loc(#loc259) + } -> tensor<1x40x23x40xbf16> loc(#loc259) + %410 = xten_nn.subgraph (%arg5 = %403: tensor<1x40x23x40xbf16>, %arg6 = %409: tensor<1x40x23x40xbf16>) attributes { + IfmOperands = [0 : index, 1 : index], + LayerName = "Add_388", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 23, 40]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 23, 40]> : vector<4xindex> + } + ], + OutputName = "Add_388", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + 
l3_tile_count = dense<[1, 40, 23, 40]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg7 = %arg5: tensor<1x40x23x40xbf16>, %arg8 = %arg6: tensor<1x40x23x40xbf16>) attributes { + LayerName = "Add_388", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 23, 40]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 23, 40]> : vector<4xindex> + } + ], + OutputName = "Add_388", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 23, 40]> : vector<4xindex> + } + ], + Specializes = "AddBf16", + Traits = { + Binary = true, + Elementwise = true + }, + With = { + config.act = 0 : ui8, + config.act_type = "LINEAR", + config.aie_arch = "aie2p", + config.compiler = "chess", + config.dtype = "bfloat16", + config.num_kernel_iters = 0 : ui16 + }} { + %462 = tosa.add %arg7, %arg8 {LayerName = "Add_388", OutputName = "Add_388"} : (tensor<1x40x23x40xbf16>, tensor<1x40x23x40xbf16>) -> tensor<1x40x23x40xbf16> loc(#loc261) + xten_nn.output %462 : tensor<1x40x23x40xbf16> loc(#loc261) + } -> tensor<1x40x23x40xbf16> loc(#loc261) + xten_nn.output %461 : tensor<1x40x23x40xbf16> loc(#loc261) + } -> tensor<1x40x23x40xbf16> loc(#loc261) + %411 = xten_nn.subgraph (%arg5 = %396: tensor<1x40x23x40xbf16>, %arg6 = %410: tensor<1x40x23x40xbf16>) attributes { + Axis = 1 : i32, + IfmOperands = [0 : index, 1 : index], + LayerName = "Concat_389", + Op = "Concat", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 23, 40]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 23, 40]> : vector<4xindex> + } + ], + OutputName = "Concat_389", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "PseudoOp", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 80, 23, 40]> : vector<4xindex> + } + ], + current_data_format = "NCHW", + data_format = "HCWN"} { + %461 = tosa.concat %arg5, %arg6 { + LayerName = "Concat_389", + OutputName = "Concat_389", + axis = 1 : i32} : (tensor<1x40x23x40xbf16>, tensor<1x40x23x40xbf16>) -> tensor<1x80x23x40xbf16> loc(#loc262) + xten_nn.output %461 : tensor<1x80x23x40xbf16> loc(#loc262) + } -> tensor<1x80x23x40xbf16> loc(#loc262) + %412 = xten_nn.subgraph (%arg5 = %411: tensor<1x80x23x40xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Resize_391", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + 
L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 80, 23, 40]> : vector<4xindex> + } + ], + OutputName = "Resize_391", + Overlay = "1x1_1x1_unspecifiedConnectivity", + Reason = "TemplatedGraph", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 80, 46, 80]> : vector<4xindex> + } + ], + Specializes = "ResizeAdf", + With = { + config.co_trans_mode = 1 : ui32, + config.dim_0 = 1 : ui32, + config.dim_1 = 80 : ui32, + config.dim_2 = 23 : ui32, + config.dim_3 = 40 : ui32, + config.dtype = "bfloat16", + config.mode = 1 : ui32, + config.nearest_mode = 0 : ui32, + config.num_ifm_shim_ch = 2 : ui32, + config.num_ofm_shim_ch = 2 : ui32, + config.output_H = 46 : ui32, + config.output_W = 80 : ui32 + }} { + %461 = xten_nn.resize %arg5 { + LayerName = "Resize_391", + OutputName = "Resize_391", + coordinate_transformation_mode = 1 : i64, + mode = 1 : i64, + nearest_mode = 0 : i64, + scales = array} : (tensor<1x80x23x40xbf16>) -> tensor<1x80x46x80xbf16> loc(#loc263) + xten_nn.output %461 : tensor<1x80x46x80xbf16> loc(#loc263) + } -> tensor<1x80x46x80xbf16> loc(#loc263) + %413 = xten_nn.subgraph (%arg5 = %412: tensor<1x80x46x80xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Slice_397", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 80, 46, 80]> : vector<4xindex> + } + ], + OutputName = "Slice_397", + Overlay = "1x1_1x1_unspecifiedConnectivity", + Reason = "TemplatedGraph", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 80, 45, 80]> : vector<4xindex> + } + ], + Specializes = "SliceHCWC8Adf", + With = { + config.aie_arch = "aie2p", + config.axis_letter = "H", + config.dim_c = 80 : ui32, + config.dim_h = 46 : ui32, + config.dim_w = 80 : ui32, + config.dtype = "bfloat16", + config.end = 45 : ui32, + config.num_ifm_shim_ch = 2 : ui32, + config.num_ofm_shim_ch = 2 : ui32, + config.start = 0 : ui32, + config.step = 1 : ui32 + }} { + %461 = tosa.slice %arg5 { + LayerName = "Slice_397", + OutputName = "Slice_397", + size = array, + start = array} : (tensor<1x80x46x80xbf16>) -> tensor<1x80x45x80xbf16> loc(#loc264) + xten_nn.output %461 : tensor<1x80x45x80xbf16> loc(#loc264) + } -> tensor<1x80x45x80xbf16> loc(#loc264) + %414 = xten_nn.subgraph (%arg5 = %413: tensor<1x80x45x80xbf16>, %arg6 = %181: tensor<1x24x45x80xbf16>, %arg7 = %392: tensor<1x3x45x80xbf16>) attributes { + Axis = 1 : i32, + IfmOperands = [0 : index, 1 : index, 2 : index], + LayerName = "Concat_398", + Op = "Concat", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 80, 45, 80]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + 
l3_tile_count = dense<[1, 24, 45, 80]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm3", + l3_extend_end = dense<[0, 5, 0, 0]> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 3, 45, 80]> : vector<4xindex> + } + ], + OutputName = "Concat_398", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "PseudoOp", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<[0, 5, 0, 0]> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 107, 45, 80]> : vector<4xindex> + } + ], + current_data_format = "NCHW", + data_format = "HCWN"} { + %461 = tosa.concat %arg5, %arg6, %arg7 { + LayerName = "Concat_398", + OutputName = "Concat_398", + axis = 1 : i32} : (tensor<1x80x45x80xbf16>, tensor<1x24x45x80xbf16>, tensor<1x3x45x80xbf16>) -> tensor<1x107x45x80xbf16> loc(#loc265) + xten_nn.output %461 : tensor<1x107x45x80xbf16> loc(#loc265) + } -> tensor<1x107x45x80xbf16> loc(#loc265) + %415 = xten_nn.subgraph (%arg5 = %414: tensor<1x107x45x80xbf16>, %arg6 = %19: tensor<40x107x3x3xbf16>, %arg7 = %18: tensor<40xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Conv_399", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<[0, 5, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 107, 45, 80]> : vector<4xindex> + }, + { + UnknownDataFormat = true, + l3_extend_end = dense<[0, 5, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[40, 107, 3, 3]> : vector<4xindex> + }, + { + UnknownDataFormat = true + } + ], + OutputName = "Relu_400", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 45, 80]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg8 = %arg5: tensor<1x107x45x80xbf16>, %arg9 = %arg6: tensor<40x107x3x3xbf16>, %arg10 = %arg7: tensor<40xbf16>) attributes { + Dilations = array<i64: 1, 1>, + HWPadding = [[1, 1], [1, 1]], + LayerName = "Conv_399", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<[0, 5, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 107, 45, 80]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "wts_data", + UnknownDataFormat = true, + l3_extend_end = dense<[0, 5, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[40, 107, 3, 3]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Relu_400", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 45, 80]> : vector<4xindex> + } + ], + Specializes = "Conv2DBf16", + Traits = { + AllowDMAOptimization = true, + NonNegativeOut = true + }, + With = { + config.AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 = 1 : ui8, + config.act = 1 : ui8, + config.act_type = "RELU", + config.aie_arch = "aie2p", +
config.batch_size = 1 : ui8, + config.compiler = "chess", + config.conv_type = [0 : ui8, 12 : ui8, 64 : ui8], + config.dtype_ifm = "bfloat16", + config.dtype_ofm = "bfloat16", + config.dtype_wts = "bfloat16", + config.ksize.height = 3 : ui8, + config.ksize.width = 3 : ui8, + config.lrelu_alpha = 0.000000e+00 : bf16, + config.lrelu_alpha_kernel = 0.000000e+00 : bf16, + config.stride_h = 1 : ui8, + config.stride_w = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %463 = "tosa.const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %464 = tosa.transpose %arg9, %463 : (tensor<40x107x3x3xbf16>, tensor<4xi32>) -> tensor<40x3x3x107xbf16> loc(#loc350) + %465 = tosa.transpose %arg8, %463 : (tensor<1x107x45x80xbf16>, tensor<4xi32>) -> tensor<1x45x80x107xbf16> loc(#loc350) + %466 = tosa.conv2d %465, %464, %arg10 { + PartOfLayerName = "Conv_399", + PartOfOutputName = "Conv_399", + dilation = array<i64: 1, 1>, + pad = array<i64: 1, 1, 1, 1>, + stride = array<i64: 1, 1>} : (tensor<1x45x80x107xbf16>, tensor<40x3x3x107xbf16>, tensor<40xbf16>) -> tensor<1x45x80x40xbf16> loc(#loc266) + %467 = tosa.clamp %466 { + LayerName = "Relu_400", + OutputName = "Relu_400", + max_fp = 3.40282347E+38 : f32, + max_int = 2147483647 : i64, + min_fp = 0.000000e+00 : f32, + min_int = 0 : i64} : (tensor<1x45x80x40xbf16>) -> tensor<1x45x80x40xbf16> loc(#loc267) + %468 = tosa.transpose %467, %462 : (tensor<1x45x80x40xbf16>, tensor<4xi32>) -> tensor<1x40x45x80xbf16> loc(#loc350) + xten_nn.output %468 : tensor<1x40x45x80xbf16> loc(#loc267) + } -> tensor<1x40x45x80xbf16> loc(#loc350) + xten_nn.output %461 : tensor<1x40x45x80xbf16> loc(#loc350) + } -> tensor<1x40x45x80xbf16> loc(#loc350) + %416 = xten_nn.subgraph (%arg5 = %415: tensor<1x40x45x80xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Split_401_Duplicated#0", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 45, 80]> : vector<4xindex> + } + ], + OutputName = "Split_401_Duplicated#0", + Overlay = "1x1_1x1_unspecifiedConnectivity", + Reason = "TemplatedGraph", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<[0, 4, 0, 0]> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 20, 45, 80]> : vector<4xindex> + } + ], + Specializes = "SliceHCWC8Adf", + With = { + config.aie_arch = "aie2p", + config.axis_letter = "C", + config.dim_c = 40 : ui32, + config.dim_h = 45 : ui32, + config.dim_w = 80 : ui32, + config.dtype = "bfloat16", + config.end = 20 : ui32, + config.num_ifm_shim_ch = 2 : ui32, + config.num_ofm_shim_ch = 2 : ui32, + config.start = 0 : ui32, + config.step = 1 : ui32 + }} { + %461 = tosa.slice %arg5 { + PartOfLayerName = "Split_401", + PartOfOutputName = "Split_401", + size = array<i64: 1, 20, 45, 80>, + start = array<i64: 0, 0, 0, 0>} : (tensor<1x40x45x80xbf16>) -> tensor<1x20x45x80xbf16> loc(#loc268) + xten_nn.output %461 : tensor<1x20x45x80xbf16> loc(#loc268) + } -> tensor<1x20x45x80xbf16> loc(#loc268) + %417 = xten_nn.subgraph (%arg5 = %415: tensor<1x40x45x80xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Split_401_Duplicated#1", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8",
+ Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 45, 80]> : vector<4xindex> + } + ], + OutputName = "Split_401_Duplicated#1", + Overlay = "1x1_1x1_unspecifiedConnectivity", + Reason = "TemplatedGraph", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<[0, 4, 0, 0]> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 20, 45, 80]> : vector<4xindex> + } + ], + Specializes = "SliceHCWC8Adf", + With = { + config.aie_arch = "aie2p", + config.axis_letter = "C", + config.dim_c = 40 : ui32, + config.dim_h = 45 : ui32, + config.dim_w = 80 : ui32, + config.dtype = "bfloat16", + config.end = 40 : ui32, + config.num_ifm_shim_ch = 2 : ui32, + config.num_ofm_shim_ch = 2 : ui32, + config.start = 20 : ui32, + config.step = 1 : ui32 + }} { + %461 = tosa.slice %arg5 { + PartOfLayerName = "Split_401", + PartOfOutputName = "Split_401", + size = array, + start = array} : (tensor<1x40x45x80xbf16>) -> tensor<1x20x45x80xbf16> loc(#loc268) + xten_nn.output %461 : tensor<1x20x45x80xbf16> loc(#loc268) + } -> tensor<1x20x45x80xbf16> loc(#loc268) + %418 = xten_nn.subgraph (%arg5 = %417: tensor<1x20x45x80xbf16>, %arg6 = %arg2: tensor<1x20x45x80xbf16>) attributes { + IfmOperands = [0 : index, 1 : index], + LayerName = "Concat_402", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<[0, 4, 0, 0]> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 20, 45, 80]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<[0, 4, 0, 0]> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 20, 45, 80]> : vector<4xindex> + } + ], + OutputName = "Concat_402", + Overlay = "1x1_1x1_unspecifiedConnectivity", + Reason = "TemplatedGraph", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 45, 80]> : vector<4xindex> + } + ], + Specializes = "ConcatC8Adf", + With = { + config.aie_arch = "aie2p", + config.dtype = "bfloat16", + config.in1_dim_c = 24 : ui32, + config.in1_dim_h = 45 : ui32, + config.in1_dim_w = 80 : ui32, + config.in2_dim_c = 24 : ui32, + config.in2_dim_h = 45 : ui32, + config.in2_dim_w = 80 : ui32, + config.num_eff_concat_input0_size = 20 : ui32, + config.num_eff_concat_input0_start = 0 : ui32, + config.num_eff_concat_input1_size = 20 : ui32, + config.num_eff_concat_input1_start = 0 : ui32 + }} { + %461 = tosa.concat %arg5, %arg6 { + LayerName = "Concat_402", + OutputName = "Concat_402", + axis = 1 : i32} : (tensor<1x20x45x80xbf16>, tensor<1x20x45x80xbf16>) -> tensor<1x40x45x80xbf16> loc(#loc269) + xten_nn.output %461 : tensor<1x40x45x80xbf16> loc(#loc269) + } -> tensor<1x40x45x80xbf16> loc(#loc269) + %419 = xten_nn.subgraph (%arg5 = %418: tensor<1x40x45x80xbf16>, %arg6 = %17: tensor<40x40x3x3xbf16>, %arg7 = %16: tensor<40xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Conv_403", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + 
L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 45, 80]> : vector<4xindex> + }, + { + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[40, 40, 3, 3]> : vector<4xindex> + }, + { + UnknownDataFormat = true + } + ], + OutputName = "Conv_403", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 45, 80]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg8 = %arg5: tensor<1x40x45x80xbf16>, %arg9 = %arg6: tensor<40x40x3x3xbf16>, %arg10 = %arg7: tensor<40xbf16>) attributes { + Dilations = array, + HWPadding = [[1, 1], [1, 1]], + LayerName = "Conv_403", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 45, 80]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "wts_data", + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[40, 40, 3, 3]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Conv_403", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 45, 80]> : vector<4xindex> + } + ], + Specializes = "Conv2DBf16", + Traits = { + AllowDMAOptimization = true + }, + With = { + config.AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 = 1 : ui8, + config.act = 0 : ui8, + config.act_type = "RELU", + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.conv_type = [0 : ui8, 12 : ui8, 64 : ui8], + config.dtype_ifm = "bfloat16", + config.dtype_ofm = "bfloat16", + config.dtype_wts = "bfloat16", + config.ksize.height = 3 : ui8, + config.ksize.width = 3 : ui8, + config.lrelu_alpha = 1.000000e+00 : bf16, + config.lrelu_alpha_kernel = 1.000000e+00 : bf16, + config.stride_h = 1 : ui8, + config.stride_w = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %463 = "tosa.const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %464 = tosa.transpose %arg9, %463 : (tensor<40x40x3x3xbf16>, tensor<4xi32>) -> tensor<40x3x3x40xbf16> loc(#loc270) + %465 = tosa.transpose %arg8, %463 : (tensor<1x40x45x80xbf16>, tensor<4xi32>) -> tensor<1x45x80x40xbf16> loc(#loc270) + %466 = tosa.conv2d %465, %464, %arg10 { + PartOfLayerName = "Conv_403", + PartOfOutputName = "Conv_403", + dilation = array, + pad = array, + stride = array} : (tensor<1x45x80x40xbf16>, tensor<40x3x3x40xbf16>, tensor<40xbf16>) -> tensor<1x45x80x40xbf16> loc(#loc270) + %467 = tosa.transpose %466, %462 : (tensor<1x45x80x40xbf16>, tensor<4xi32>) -> tensor<1x40x45x80xbf16> loc(#loc270) + xten_nn.output %467 : tensor<1x40x45x80xbf16> loc(#loc270) + } -> tensor<1x40x45x80xbf16> loc(#loc270) + xten_nn.output %461 : tensor<1x40x45x80xbf16> loc(#loc270) + } -> tensor<1x40x45x80xbf16> loc(#loc270) + %420 = xten_nn.subgraph (%arg5 = %419: 
tensor<1x40x45x80xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Sigmoid_404", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 45, 80]> : vector<4xindex> + } + ], + OutputName = "Sigmoid_404", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 45, 80]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x40x45x80xbf16>) attributes { + LayerName = "Sigmoid_404", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 45, 80]> : vector<4xindex> + } + ], + OutputName = "Sigmoid_404", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 45, 80]> : vector<4xindex> + } + ], + Specializes = "SigmoidTemplatedBf16", + Traits = { + Elementwise = true, + NonNegativeOut = true, + Unary = true + }, + With = { + config.ENABLE_FP16_AS_BF16 = 0 : ui8, + config.aie_arch = "aie2p", + config.compiler = "chess", + config.ifm_shift = 0 : si8, + config.num_kernel_iters = 0 : ui16, + config.ofm_shift = 0 : si8 + }} { + %462 = tosa.sigmoid %arg6 {LayerName = "Sigmoid_404", OutputName = "Sigmoid_404"} : (tensor<1x40x45x80xbf16>) -> tensor<1x40x45x80xbf16> loc(#loc271) + xten_nn.output %462 : tensor<1x40x45x80xbf16> loc(#loc271) + } -> tensor<1x40x45x80xbf16> loc(#loc271) + xten_nn.output %461 : tensor<1x40x45x80xbf16> loc(#loc271) + } -> tensor<1x40x45x80xbf16> loc(#loc271) + %421 = xten_nn.subgraph (%arg5 = %420: tensor<1x40x45x80xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Split_405_Duplicated#1", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 45, 80]> : vector<4xindex> + } + ], + OutputName = "Split_405_Duplicated#1", + Overlay = "1x1_1x1_unspecifiedConnectivity", + Reason = "TemplatedGraph", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<[0, 4, 0, 0]> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 20, 45, 80]> : vector<4xindex> + } + ], + Specializes = "SliceHCWC8Adf", + With = { + config.aie_arch = "aie2p", + config.axis_letter = "C", + config.dim_c = 40 : ui32, + config.dim_h = 45 : ui32, + config.dim_w = 80 : ui32, + config.dtype = "bfloat16", + config.end = 40 : ui32, + config.num_ifm_shim_ch = 2 : ui32, + config.num_ofm_shim_ch = 2 : ui32, + config.start = 20 : ui32, + config.step = 1 : ui32 + }} { + %461 = tosa.slice %arg5 { + PartOfLayerName = "Split_405", + PartOfOutputName = "Split_405", + size = array, + start = array} : (tensor<1x40x45x80xbf16>) -> tensor<1x20x45x80xbf16> loc(#loc272) + 
xten_nn.output %461 : tensor<1x20x45x80xbf16> loc(#loc272) + } -> tensor<1x20x45x80xbf16> loc(#loc272) + %422 = xten_nn.subgraph (%arg5 = %13: tensor<1x20x45x80xbf16>, %arg6 = %421: tensor<1x20x45x80xbf16>) attributes { + IfmOperands = [1 : index], + LayerName = "Sub_411", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<[0, 4, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 20, 45, 80]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<[0, 4, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 20, 45, 80]> : vector<4xindex> + } + ], + OutputName = "Sub_411", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<[0, 4, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 20, 45, 80]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg7 = %arg5: tensor<1x20x45x80xbf16>, %arg8 = %arg6: tensor<1x20x45x80xbf16>) attributes { + LayerName = "Sub_411", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<[0, 4, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 20, 45, 80]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<[0, 4, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 20, 45, 80]> : vector<4xindex> + } + ], + OutputName = "Sub_411", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<[0, 4, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 20, 45, 80]> : vector<4xindex> + } + ], + Specializes = "SubBf16", + Traits = { + Binary = true, + Elementwise = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.dtype = "bfloat16", + config.num_kernel_iters = 0 : ui16 + }} { + %462 = tosa.sub %arg7, %arg8 {LayerName = "Sub_411", OutputName = "Sub_411"} : (tensor<1x20x45x80xbf16>, tensor<1x20x45x80xbf16>) -> tensor<1x20x45x80xbf16> loc(#loc3) + xten_nn.output %462 : tensor<1x20x45x80xbf16> loc(#loc3) + } -> tensor<1x20x45x80xbf16> loc(#loc3) + xten_nn.output %461 : tensor<1x20x45x80xbf16> loc(#loc3) + } -> tensor<1x20x45x80xbf16> loc(#loc3) + %423 = xten_nn.subgraph (%arg5 = %422: tensor<1x20x45x80xbf16>, %arg6 = %arg2: tensor<1x20x45x80xbf16>) attributes { + IfmOperands = [0 : index, 1 : index], + LayerName = "Mul_412", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<[0, 4, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 20, 45, 80]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<[0, 4, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 20, 45, 80]> : vector<4xindex> + } + ], + OutputName = "Mul_412", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<[0, 
4, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 20, 45, 80]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg7 = %arg5: tensor<1x20x45x80xbf16>, %arg8 = %arg6: tensor<1x20x45x80xbf16>) attributes { + LayerName = "Mul_412", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<[0, 4, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 20, 45, 80]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<[0, 4, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 20, 45, 80]> : vector<4xindex> + } + ], + OutputName = "Mul_412", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<[0, 4, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 20, 45, 80]> : vector<4xindex> + } + ], + Specializes = "MulBf16", + Traits = { + Binary = true, + Elementwise = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.dtype = "bfloat16", + config.num_kernel_iters = 0 : ui16 + }} { + %462 = tosa.mul %arg7, %arg8 { + LayerName = "Mul_412", + OutputName = "Mul_412", + shift = 0 : i8} : (tensor<1x20x45x80xbf16>, tensor<1x20x45x80xbf16>) -> tensor<1x20x45x80xbf16> loc(#loc278) + xten_nn.output %462 : tensor<1x20x45x80xbf16> loc(#loc278) + } -> tensor<1x20x45x80xbf16> loc(#loc278) + xten_nn.output %461 : tensor<1x20x45x80xbf16> loc(#loc278) + } -> tensor<1x20x45x80xbf16> loc(#loc278) + %424 = xten_nn.subgraph (%arg5 = %420: tensor<1x40x45x80xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Split_405_Duplicated#0", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 45, 80]> : vector<4xindex> + } + ], + OutputName = "Split_405_Duplicated#0", + Overlay = "1x1_1x1_unspecifiedConnectivity", + Reason = "TemplatedGraph", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<[0, 4, 0, 0]> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 20, 45, 80]> : vector<4xindex> + } + ], + Specializes = "SliceHCWC8Adf", + With = { + config.aie_arch = "aie2p", + config.axis_letter = "C", + config.dim_c = 40 : ui32, + config.dim_h = 45 : ui32, + config.dim_w = 80 : ui32, + config.dtype = "bfloat16", + config.end = 20 : ui32, + config.num_ifm_shim_ch = 2 : ui32, + config.num_ofm_shim_ch = 2 : ui32, + config.start = 0 : ui32, + config.step = 1 : ui32 + }} { + %461 = tosa.slice %arg5 { + PartOfLayerName = "Split_405", + PartOfOutputName = "Split_405", + size = array<i64: 1, 20, 45, 80>, + start = array<i64: 0, 0, 0, 0>} : (tensor<1x40x45x80xbf16>) -> tensor<1x20x45x80xbf16> loc(#loc272) + xten_nn.output %461 : tensor<1x20x45x80xbf16> loc(#loc272) + } -> tensor<1x20x45x80xbf16> loc(#loc272) + %425 = xten_nn.subgraph (%arg5 = %424: tensor<1x20x45x80xbf16>, %arg6 = %arg2: tensor<1x20x45x80xbf16>) attributes { + IfmOperands = [0 : index, 1 : index], + LayerName = "Mul_406", +
Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<[0, 4, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 20, 45, 80]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<[0, 4, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 20, 45, 80]> : vector<4xindex> + } + ], + OutputName = "Mul_406", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<[0, 4, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 20, 45, 80]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg7 = %arg5: tensor<1x20x45x80xbf16>, %arg8 = %arg6: tensor<1x20x45x80xbf16>) attributes { + LayerName = "Mul_406", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<[0, 4, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 20, 45, 80]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<[0, 4, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 20, 45, 80]> : vector<4xindex> + } + ], + OutputName = "Mul_406", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<[0, 4, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 20, 45, 80]> : vector<4xindex> + } + ], + Specializes = "MulBf16", + Traits = { + Binary = true, + Elementwise = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.dtype = "bfloat16", + config.num_kernel_iters = 0 : ui16 + }} { + %462 = tosa.mul %arg7, %arg8 { + LayerName = "Mul_406", + OutputName = "Mul_406", + shift = 0 : i8} : (tensor<1x20x45x80xbf16>, tensor<1x20x45x80xbf16>) -> tensor<1x20x45x80xbf16> loc(#loc273) + xten_nn.output %462 : tensor<1x20x45x80xbf16> loc(#loc273) + } -> tensor<1x20x45x80xbf16> loc(#loc273) + xten_nn.output %461 : tensor<1x20x45x80xbf16> loc(#loc273) + } -> tensor<1x20x45x80xbf16> loc(#loc273) + %426 = xten_nn.subgraph (%arg5 = %417: tensor<1x20x45x80xbf16>, %arg6 = %425: tensor<1x20x45x80xbf16>) attributes { + IfmOperands = [0 : index, 1 : index], + LayerName = "Concat_407", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<[0, 4, 0, 0]> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 20, 45, 80]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<[0, 4, 0, 0]> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 20, 45, 80]> : vector<4xindex> + } + ], + OutputName = "Concat_407", + Overlay = "1x1_1x1_unspecifiedConnectivity", + Reason = "TemplatedGraph", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> 
: vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 45, 80]> : vector<4xindex> + } + ], + Specializes = "ConcatC8Adf", + With = { + config.aie_arch = "aie2p", + config.dtype = "bfloat16", + config.in1_dim_c = 24 : ui32, + config.in1_dim_h = 45 : ui32, + config.in1_dim_w = 80 : ui32, + config.in2_dim_c = 24 : ui32, + config.in2_dim_h = 45 : ui32, + config.in2_dim_w = 80 : ui32, + config.num_eff_concat_input0_size = 20 : ui32, + config.num_eff_concat_input0_start = 0 : ui32, + config.num_eff_concat_input1_size = 20 : ui32, + config.num_eff_concat_input1_start = 0 : ui32 + }} { + %461 = tosa.concat %arg5, %arg6 { + LayerName = "Concat_407", + OutputName = "Concat_407", + axis = 1 : i32} : (tensor<1x20x45x80xbf16>, tensor<1x20x45x80xbf16>) -> tensor<1x40x45x80xbf16> loc(#loc274) + xten_nn.output %461 : tensor<1x40x45x80xbf16> loc(#loc274) + } -> tensor<1x40x45x80xbf16> loc(#loc274) + %427 = xten_nn.subgraph (%arg5 = %426: tensor<1x40x45x80xbf16>, %arg6 = %15: tensor<20x40x3x3xbf16>, %arg7 = %14: tensor<20xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Conv_408", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 45, 80]> : vector<4xindex> + }, + { + UnknownDataFormat = true, + l3_extend_end = dense<[4, 0, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[20, 40, 3, 3]> : vector<4xindex> + }, + { + UnknownDataFormat = true + } + ], + OutputName = "Conv_408", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<[0, 4, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 20, 45, 80]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg8 = %arg5: tensor<1x40x45x80xbf16>, %arg9 = %arg6: tensor<20x40x3x3xbf16>, %arg10 = %arg7: tensor<20xbf16>) attributes { + Dilations = array<i64: 1, 1>, + HWPadding = [[1, 1], [1, 1]], + LayerName = "Conv_408", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 45, 80]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "wts_data", + UnknownDataFormat = true, + l3_extend_end = dense<[4, 0, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[20, 40, 3, 3]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Conv_408", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<[0, 4, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 20, 45, 80]> : vector<4xindex> + } + ], + Specializes = "Conv2DBf16", + Traits = { + AllowDMAOptimization = true + }, + With = { + config.AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 = 1 : ui8, + config.act = 0 : ui8, + config.act_type = "RELU", + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.conv_type = [0 : ui8, 12 : ui8, 64 : ui8], + config.dtype_ifm = "bfloat16", + config.dtype_ofm = "bfloat16", + config.dtype_wts = "bfloat16", + config.ksize.height = 3 : ui8, + config.ksize.width
= 3 : ui8, + config.lrelu_alpha = 1.000000e+00 : bf16, + config.lrelu_alpha_kernel = 1.000000e+00 : bf16, + config.stride_h = 1 : ui8, + config.stride_w = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %463 = "tosa.const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %464 = tosa.transpose %arg9, %463 : (tensor<20x40x3x3xbf16>, tensor<4xi32>) -> tensor<20x3x3x40xbf16> loc(#loc275) + %465 = tosa.transpose %arg8, %463 : (tensor<1x40x45x80xbf16>, tensor<4xi32>) -> tensor<1x45x80x40xbf16> loc(#loc275) + %466 = tosa.conv2d %465, %464, %arg10 { + PartOfLayerName = "Conv_408", + PartOfOutputName = "Conv_408", + dilation = array<i64: 1, 1>, + pad = array<i64: 1, 1, 1, 1>, + stride = array<i64: 1, 1>} : (tensor<1x45x80x40xbf16>, tensor<20x3x3x40xbf16>, tensor<20xbf16>) -> tensor<1x45x80x20xbf16> loc(#loc275) + %467 = tosa.transpose %466, %462 : (tensor<1x45x80x20xbf16>, tensor<4xi32>) -> tensor<1x20x45x80xbf16> loc(#loc275) + xten_nn.output %467 : tensor<1x20x45x80xbf16> loc(#loc275) + } -> tensor<1x20x45x80xbf16> loc(#loc275) + xten_nn.output %461 : tensor<1x20x45x80xbf16> loc(#loc275) + } -> tensor<1x20x45x80xbf16> loc(#loc275) + %428 = xten_nn.subgraph (%arg5 = %427: tensor<1x20x45x80xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Tanh_409", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<[0, 4, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 20, 45, 80]> : vector<4xindex> + } + ], + OutputName = "Tanh_409", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<[0, 4, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 20, 45, 80]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x20x45x80xbf16>) attributes { + LayerName = "Tanh_409", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<[0, 4, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 20, 45, 80]> : vector<4xindex> + } + ], + OutputName = "Tanh_409", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<[0, 4, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 20, 45, 80]> : vector<4xindex> + } + ], + Specializes = "TanhTemplatedBf16", + Traits = { + Elementwise = true, + Unary = true + }, + With = { + config.ENABLE_FP16_AS_BF16 = 0 : ui8, + config.aie_arch = "aie2p", + config.compiler = "chess", + config.ifm_shift = 0 : si8, + config.num_kernel_iters = 0 : ui16, + config.ofm_shift = 0 : si8 + }} { + %462 = tosa.tanh %arg6 {LayerName = "Tanh_409", OutputName = "Tanh_409"} : (tensor<1x20x45x80xbf16>) -> tensor<1x20x45x80xbf16> loc(#loc276) + xten_nn.output %462 : tensor<1x20x45x80xbf16> loc(#loc276) + } -> tensor<1x20x45x80xbf16> loc(#loc276) + xten_nn.output %461 : tensor<1x20x45x80xbf16> loc(#loc276) + } -> tensor<1x20x45x80xbf16> loc(#loc276) + %429 = xten_nn.subgraph (%arg5 = %421: tensor<1x20x45x80xbf16>, %arg6 = %428: tensor<1x20x45x80xbf16>) attributes { + IfmOperands = [0 : index, 1 : index], + LayerName = "Mul_413", + Operands = [ + {
+ CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<[0, 4, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 20, 45, 80]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<[0, 4, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 20, 45, 80]> : vector<4xindex> + } + ], + OutputName = "Mul_413", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<[0, 4, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 20, 45, 80]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg7 = %arg5: tensor<1x20x45x80xbf16>, %arg8 = %arg6: tensor<1x20x45x80xbf16>) attributes { + LayerName = "Mul_413", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<[0, 4, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 20, 45, 80]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<[0, 4, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 20, 45, 80]> : vector<4xindex> + } + ], + OutputName = "Mul_413", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<[0, 4, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 20, 45, 80]> : vector<4xindex> + } + ], + Specializes = "MulBf16", + Traits = { + Binary = true, + Elementwise = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.dtype = "bfloat16", + config.num_kernel_iters = 0 : ui16 + }} { + %462 = tosa.mul %arg7, %arg8 { + LayerName = "Mul_413", + OutputName = "Mul_413", + shift = 0 : i8} : (tensor<1x20x45x80xbf16>, tensor<1x20x45x80xbf16>) -> tensor<1x20x45x80xbf16> loc(#loc277) + xten_nn.output %462 : tensor<1x20x45x80xbf16> loc(#loc277) + } -> tensor<1x20x45x80xbf16> loc(#loc277) + xten_nn.output %461 : tensor<1x20x45x80xbf16> loc(#loc277) + } -> tensor<1x20x45x80xbf16> loc(#loc277) + %430 = xten_nn.subgraph (%arg5 = %423: tensor<1x20x45x80xbf16>, %arg6 = %429: tensor<1x20x45x80xbf16>) attributes { + IfmOperands = [0 : index, 1 : index], + LayerName = "Add_414", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<[0, 4, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 20, 45, 80]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<[0, 4, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 20, 45, 80]> : vector<4xindex> + } + ], + OutputName = "Add_414", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<[0, 4, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 20, 45, 80]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} 
{ + %461 = xten_nn.subgraph (%arg7 = %arg5: tensor<1x20x45x80xbf16>, %arg8 = %arg6: tensor<1x20x45x80xbf16>) attributes { + LayerName = "Add_414", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<[0, 4, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 20, 45, 80]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<[0, 4, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 20, 45, 80]> : vector<4xindex> + } + ], + OutputName = "Add_414", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<[0, 4, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 20, 45, 80]> : vector<4xindex> + } + ], + Specializes = "AddBf16", + Traits = { + Binary = true, + Elementwise = true + }, + With = { + config.act = 0 : ui8, + config.act_type = "LINEAR", + config.aie_arch = "aie2p", + config.compiler = "chess", + config.dtype = "bfloat16", + config.num_kernel_iters = 0 : ui16 + }} { + %462 = tosa.add %arg7, %arg8 {LayerName = "Add_414", OutputName = "Add_414"} : (tensor<1x20x45x80xbf16>, tensor<1x20x45x80xbf16>) -> tensor<1x20x45x80xbf16> loc(#loc279) + xten_nn.output %462 : tensor<1x20x45x80xbf16> loc(#loc279) + } -> tensor<1x20x45x80xbf16> loc(#loc279) + xten_nn.output %461 : tensor<1x20x45x80xbf16> loc(#loc279) + } -> tensor<1x20x45x80xbf16> loc(#loc279) + %431 = xten_nn.subgraph (%arg5 = %416: tensor<1x20x45x80xbf16>, %arg6 = %430: tensor<1x20x45x80xbf16>) attributes { + IfmOperands = [0 : index, 1 : index], + LayerName = "Concat_415", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<[0, 4, 0, 0]> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 20, 45, 80]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<[0, 4, 0, 0]> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 20, 45, 80]> : vector<4xindex> + } + ], + OutputName = "Concat_415", + Overlay = "1x1_1x1_unspecifiedConnectivity", + Reason = "TemplatedGraph", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 45, 80]> : vector<4xindex> + } + ], + Specializes = "ConcatC8Adf", + With = { + config.aie_arch = "aie2p", + config.dtype = "bfloat16", + config.in1_dim_c = 24 : ui32, + config.in1_dim_h = 45 : ui32, + config.in1_dim_w = 80 : ui32, + config.in2_dim_c = 24 : ui32, + config.in2_dim_h = 45 : ui32, + config.in2_dim_w = 80 : ui32, + config.num_eff_concat_input0_size = 20 : ui32, + config.num_eff_concat_input0_start = 0 : ui32, + config.num_eff_concat_input1_size = 20 : ui32, + config.num_eff_concat_input1_start = 0 : ui32 + }} { + %461 = tosa.concat %arg5, %arg6 { + LayerName = "Concat_415", + OutputName = "Concat_415", + axis = 1 : i32} : (tensor<1x20x45x80xbf16>, tensor<1x20x45x80xbf16>) -> tensor<1x40x45x80xbf16> loc(#loc280) + xten_nn.output %461 : 
tensor<1x40x45x80xbf16> loc(#loc280) + } -> tensor<1x40x45x80xbf16> loc(#loc280) + %432 = xten_nn.subgraph (%arg5 = %431: tensor<1x40x45x80xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Resize_417", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 45, 80]> : vector<4xindex> + } + ], + OutputName = "Resize_417", + Overlay = "1x1_1x1_unspecifiedConnectivity", + Reason = "TemplatedGraph", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 90, 160]> : vector<4xindex> + } + ], + Specializes = "ResizeAdf", + With = { + config.co_trans_mode = 1 : ui32, + config.dim_0 = 1 : ui32, + config.dim_1 = 40 : ui32, + config.dim_2 = 45 : ui32, + config.dim_3 = 80 : ui32, + config.dtype = "bfloat16", + config.mode = 1 : ui32, + config.nearest_mode = 0 : ui32, + config.num_ifm_shim_ch = 2 : ui32, + config.num_ofm_shim_ch = 2 : ui32, + config.output_H = 90 : ui32, + config.output_W = 160 : ui32 + }} { + %461 = xten_nn.resize %arg5 { + LayerName = "Resize_417", + OutputName = "Resize_417", + coordinate_transformation_mode = 1 : i64, + mode = 1 : i64, + nearest_mode = 0 : i64, + scales = array<f32: 1.000000e+00, 1.000000e+00, 2.000000e+00, 2.000000e+00>} : (tensor<1x40x45x80xbf16>) -> tensor<1x40x90x160xbf16> loc(#loc281) + xten_nn.output %461 : tensor<1x40x90x160xbf16> loc(#loc281) + } -> tensor<1x40x90x160xbf16> loc(#loc281) + %433 = xten_nn.subgraph (%arg5 = %432: tensor<1x40x90x160xbf16>, %arg6 = %175: tensor<1x16x90x160xbf16>, %arg7 = %391: tensor<1x3x90x160xbf16>) attributes { + Axis = 1 : i32, + IfmOperands = [0 : index, 1 : index, 2 : index], + LayerName = "Concat_418", + Op = "Concat", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 40, 90, 160]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm3", + l3_extend_end = dense<[0, 5, 0, 0]> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 3, 90, 160]> : vector<4xindex> + } + ], + OutputName = "Concat_418", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "PseudoOp", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<[0, 5, 0, 0]> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 59, 90, 160]> : vector<4xindex> + } + ], + current_data_format = "NCHW", + data_format = "HCWN"} { + %461 = tosa.concat %arg5, %arg6, %arg7 { + LayerName = "Concat_418", + OutputName = "Concat_418", + axis = 1 : i32} : (tensor<1x40x90x160xbf16>, tensor<1x16x90x160xbf16>, tensor<1x3x90x160xbf16>) -> tensor<1x59x90x160xbf16> loc(#loc282) + xten_nn.output %461 : tensor<1x59x90x160xbf16> loc(#loc282) + } -> tensor<1x59x90x160xbf16> loc(#loc282) + %434 = xten_nn.subgraph (%arg5 =
%433: tensor<1x59x90x160xbf16>, %arg6 = %12: tensor<32x59x3x3xbf16>, %arg7 = %11: tensor<32xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Conv_419", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<[0, 5, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 59, 90, 160]> : vector<4xindex> + }, + { + UnknownDataFormat = true, + l3_extend_end = dense<[0, 5, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[32, 59, 3, 3]> : vector<4xindex> + }, + { + UnknownDataFormat = true + } + ], + OutputName = "Relu_420", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 32, 90, 160]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "double", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg8 = %arg5: tensor<1x59x90x160xbf16>, %arg9 = %arg6: tensor<32x59x3x3xbf16>, %arg10 = %arg7: tensor<32xbf16>) attributes { + Dilations = array<i64: 1, 1>, + HWPadding = [[1, 1], [1, 1]], + LayerName = "Conv_419", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<[0, 5, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 59, 90, 160]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "wts_data", + UnknownDataFormat = true, + l3_extend_end = dense<[0, 5, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[32, 59, 3, 3]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Relu_420", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 32, 90, 160]> : vector<4xindex> + } + ], + Specializes = "Conv2DBf16", + Traits = { + AllowDMAOptimization = true, + NonNegativeOut = true + }, + With = { + config.AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 = 1 : ui8, + config.act = 1 : ui8, + config.act_type = "RELU", + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.conv_type = [0 : ui8, 12 : ui8, 64 : ui8], + config.dtype_ifm = "bfloat16", + config.dtype_ofm = "bfloat16", + config.dtype_wts = "bfloat16", + config.ksize.height = 3 : ui8, + config.ksize.width = 3 : ui8, + config.lrelu_alpha = 0.000000e+00 : bf16, + config.lrelu_alpha_kernel = 0.000000e+00 : bf16, + config.stride_h = 1 : ui8, + config.stride_w = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %463 = "tosa.const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %464 = tosa.transpose %arg9, %463 : (tensor<32x59x3x3xbf16>, tensor<4xi32>) -> tensor<32x3x3x59xbf16> loc(#loc351) + %465 = tosa.transpose %arg8, %463 : (tensor<1x59x90x160xbf16>, tensor<4xi32>) -> tensor<1x90x160x59xbf16> loc(#loc351) + %466 = tosa.conv2d %465, %464, %arg10 { + PartOfLayerName = "Conv_419", + PartOfOutputName = "Conv_419", + dilation = array<i64: 1, 1>, + pad = array<i64: 1, 1, 1, 1>, + stride = array<i64: 1, 1>} : (tensor<1x90x160x59xbf16>, tensor<32x3x3x59xbf16>, tensor<32xbf16>) -> tensor<1x90x160x32xbf16> loc(#loc283) + %467 = tosa.clamp %466 { + LayerName = "Relu_420", +
OutputName = "Relu_420", + max_fp = 3.40282347E+38 : f32, + max_int = 2147483647 : i64, + min_fp = 0.000000e+00 : f32, + min_int = 0 : i64} : (tensor<1x90x160x32xbf16>) -> tensor<1x90x160x32xbf16> loc(#loc284) + %468 = tosa.transpose %467, %462 : (tensor<1x90x160x32xbf16>, tensor<4xi32>) -> tensor<1x32x90x160xbf16> loc(#loc351) + xten_nn.output %468 : tensor<1x32x90x160xbf16> loc(#loc284) + } -> tensor<1x32x90x160xbf16> loc(#loc351) + xten_nn.output %461 : tensor<1x32x90x160xbf16> loc(#loc351) + } -> tensor<1x32x90x160xbf16> loc(#loc351) + %435 = xten_nn.subgraph (%arg5 = %434: tensor<1x32x90x160xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Split_421_Duplicated#0", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 32, 90, 160]> : vector<4xindex> + } + ], + OutputName = "Split_421_Duplicated#0", + Overlay = "1x1_1x1_unspecifiedConnectivity", + Reason = "TemplatedGraph", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + } + ], + Specializes = "SliceHCWC8Adf", + With = { + config.aie_arch = "aie2p", + config.axis_letter = "C", + config.dim_c = 32 : ui32, + config.dim_h = 90 : ui32, + config.dim_w = 160 : ui32, + config.dtype = "bfloat16", + config.end = 16 : ui32, + config.num_ifm_shim_ch = 2 : ui32, + config.num_ofm_shim_ch = 2 : ui32, + config.start = 0 : ui32, + config.step = 1 : ui32 + }} { + %461 = tosa.slice %arg5 { + PartOfLayerName = "Split_421", + PartOfOutputName = "Split_421", + size = array, + start = array} : (tensor<1x32x90x160xbf16>) -> tensor<1x16x90x160xbf16> loc(#loc285) + xten_nn.output %461 : tensor<1x16x90x160xbf16> loc(#loc285) + } -> tensor<1x16x90x160xbf16> loc(#loc285) + %436 = xten_nn.subgraph (%arg5 = %434: tensor<1x32x90x160xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Split_421_Duplicated#1", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 32, 90, 160]> : vector<4xindex> + } + ], + OutputName = "Split_421_Duplicated#1", + Overlay = "1x1_1x1_unspecifiedConnectivity", + Reason = "TemplatedGraph", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + } + ], + Specializes = "SliceHCWC8Adf", + With = { + config.aie_arch = "aie2p", + config.axis_letter = "C", + config.dim_c = 32 : ui32, + config.dim_h = 90 : ui32, + config.dim_w = 160 : ui32, + config.dtype = "bfloat16", + config.end = 32 : ui32, + config.num_ifm_shim_ch = 2 : ui32, + config.num_ofm_shim_ch = 2 : ui32, + config.start = 16 : ui32, + config.step = 1 : ui32 + }} { + %461 = tosa.slice %arg5 { + PartOfLayerName = "Split_421", + PartOfOutputName = "Split_421", + size = array, + start = array} : (tensor<1x32x90x160xbf16>) -> tensor<1x16x90x160xbf16> loc(#loc285) + xten_nn.output %461 : 
tensor<1x16x90x160xbf16> loc(#loc285) + } -> tensor<1x16x90x160xbf16> loc(#loc285) + %437 = xten_nn.subgraph (%arg5 = %436: tensor<1x16x90x160xbf16>, %arg6 = %arg1: tensor<1x16x90x160xbf16>) attributes { + Axis = 1 : i32, + IfmOperands = [0 : index, 1 : index], + LayerName = "Concat_422", + Op = "Concat", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + } + ], + OutputName = "Concat_422", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "PseudoOp", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 32, 90, 160]> : vector<4xindex> + } + ], + current_data_format = "NCHW", + data_format = "HCWN"} { + %461 = tosa.concat %arg5, %arg6 { + LayerName = "Concat_422", + OutputName = "Concat_422", + axis = 1 : i32} : (tensor<1x16x90x160xbf16>, tensor<1x16x90x160xbf16>) -> tensor<1x32x90x160xbf16> loc(#loc286) + xten_nn.output %461 : tensor<1x32x90x160xbf16> loc(#loc286) + } -> tensor<1x32x90x160xbf16> loc(#loc286) + %438 = xten_nn.subgraph (%arg5 = %437: tensor<1x32x90x160xbf16>, %arg6 = %10: tensor<32x32x3x3xbf16>, %arg7 = %9: tensor<32xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Conv_423", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 32, 90, 160]> : vector<4xindex> + }, + { + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[32, 32, 3, 3]> : vector<4xindex> + }, + { + UnknownDataFormat = true + } + ], + OutputName = "Conv_423", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 32, 90, 160]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "double", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg8 = %arg5: tensor<1x32x90x160xbf16>, %arg9 = %arg6: tensor<32x32x3x3xbf16>, %arg10 = %arg7: tensor<32xbf16>) attributes { + Dilations = array<i64: 1, 1>, + HWPadding = [[1, 1], [1, 1]], + LayerName = "Conv_423", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 32, 90, 160]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "wts_data", + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[32, 32, 3, 3]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Conv_423", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN",
+ L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 32, 90, 160]> : vector<4xindex> + } + ], + Specializes = "Conv2DBf16", + Traits = { + AllowDMAOptimization = true + }, + With = { + config.AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 = 1 : ui8, + config.act = 0 : ui8, + config.act_type = "RELU", + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.conv_type = [0 : ui8, 12 : ui8, 64 : ui8], + config.dtype_ifm = "bfloat16", + config.dtype_ofm = "bfloat16", + config.dtype_wts = "bfloat16", + config.ksize.height = 3 : ui8, + config.ksize.width = 3 : ui8, + config.lrelu_alpha = 1.000000e+00 : bf16, + config.lrelu_alpha_kernel = 1.000000e+00 : bf16, + config.stride_h = 1 : ui8, + config.stride_w = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %463 = "tosa.const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %464 = tosa.transpose %arg9, %463 : (tensor<32x32x3x3xbf16>, tensor<4xi32>) -> tensor<32x3x3x32xbf16> loc(#loc287) + %465 = tosa.transpose %arg8, %463 : (tensor<1x32x90x160xbf16>, tensor<4xi32>) -> tensor<1x90x160x32xbf16> loc(#loc287) + %466 = tosa.conv2d %465, %464, %arg10 { + PartOfLayerName = "Conv_423", + PartOfOutputName = "Conv_423", + dilation = array, + pad = array, + stride = array} : (tensor<1x90x160x32xbf16>, tensor<32x3x3x32xbf16>, tensor<32xbf16>) -> tensor<1x90x160x32xbf16> loc(#loc287) + %467 = tosa.transpose %466, %462 : (tensor<1x90x160x32xbf16>, tensor<4xi32>) -> tensor<1x32x90x160xbf16> loc(#loc287) + xten_nn.output %467 : tensor<1x32x90x160xbf16> loc(#loc287) + } -> tensor<1x32x90x160xbf16> loc(#loc287) + xten_nn.output %461 : tensor<1x32x90x160xbf16> loc(#loc287) + } -> tensor<1x32x90x160xbf16> loc(#loc287) + %439 = xten_nn.subgraph (%arg5 = %438: tensor<1x32x90x160xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Sigmoid_424", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 32, 90, 160]> : vector<4xindex> + } + ], + OutputName = "Sigmoid_424", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 32, 90, 160]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "double", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x32x90x160xbf16>) attributes { + LayerName = "Sigmoid_424", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 32, 90, 160]> : vector<4xindex> + } + ], + OutputName = "Sigmoid_424", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 32, 90, 160]> : vector<4xindex> + } + ], + Specializes = "SigmoidTemplatedBf16", + Traits = { + Elementwise = true, + NonNegativeOut = true, + Unary = true + }, + With = { + config.ENABLE_FP16_AS_BF16 = 0 : ui8, + config.aie_arch = "aie2p", + 
config.compiler = "chess", + config.ifm_shift = 0 : si8, + config.num_kernel_iters = 0 : ui16, + config.ofm_shift = 0 : si8 + }} { + %462 = tosa.sigmoid %arg6 {LayerName = "Sigmoid_424", OutputName = "Sigmoid_424"} : (tensor<1x32x90x160xbf16>) -> tensor<1x32x90x160xbf16> loc(#loc288) + xten_nn.output %462 : tensor<1x32x90x160xbf16> loc(#loc288) + } -> tensor<1x32x90x160xbf16> loc(#loc288) + xten_nn.output %461 : tensor<1x32x90x160xbf16> loc(#loc288) + } -> tensor<1x32x90x160xbf16> loc(#loc288) + %440 = xten_nn.subgraph (%arg5 = %439: tensor<1x32x90x160xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Split_425_Duplicated#1", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 32, 90, 160]> : vector<4xindex> + } + ], + OutputName = "Split_425_Duplicated#1", + Overlay = "1x1_1x1_unspecifiedConnectivity", + Reason = "TemplatedGraph", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + } + ], + Specializes = "SliceHCWC8Adf", + With = { + config.aie_arch = "aie2p", + config.axis_letter = "C", + config.dim_c = 32 : ui32, + config.dim_h = 90 : ui32, + config.dim_w = 160 : ui32, + config.dtype = "bfloat16", + config.end = 32 : ui32, + config.num_ifm_shim_ch = 2 : ui32, + config.num_ofm_shim_ch = 2 : ui32, + config.start = 16 : ui32, + config.step = 1 : ui32 + }} { + %461 = tosa.slice %arg5 { + PartOfLayerName = "Split_425", + PartOfOutputName = "Split_425", + size = array, + start = array} : (tensor<1x32x90x160xbf16>) -> tensor<1x16x90x160xbf16> loc(#loc289) + xten_nn.output %461 : tensor<1x16x90x160xbf16> loc(#loc289) + } -> tensor<1x16x90x160xbf16> loc(#loc289) + %441 = xten_nn.subgraph (%arg5 = %6: tensor<1x16x90x160xbf16>, %arg6 = %440: tensor<1x16x90x160xbf16>) attributes { + IfmOperands = [1 : index], + LayerName = "Sub_431", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + } + ], + OutputName = "Sub_431", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "double", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg7 = %arg5: tensor<1x16x90x160xbf16>, %arg8 = %arg6: tensor<1x16x90x160xbf16>) attributes { + LayerName = "Sub_431", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + 
Port = "data_io.ifm2", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + } + ], + OutputName = "Sub_431", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + } + ], + Specializes = "SubBf16", + Traits = { + Binary = true, + Elementwise = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.dtype = "bfloat16", + config.num_kernel_iters = 0 : ui16 + }} { + %462 = tosa.sub %arg7, %arg8 {LayerName = "Sub_431", OutputName = "Sub_431"} : (tensor<1x16x90x160xbf16>, tensor<1x16x90x160xbf16>) -> tensor<1x16x90x160xbf16> loc(#loc2) + xten_nn.output %462 : tensor<1x16x90x160xbf16> loc(#loc2) + } -> tensor<1x16x90x160xbf16> loc(#loc2) + xten_nn.output %461 : tensor<1x16x90x160xbf16> loc(#loc2) + } -> tensor<1x16x90x160xbf16> loc(#loc2) + %442 = xten_nn.subgraph (%arg5 = %441: tensor<1x16x90x160xbf16>, %arg6 = %arg1: tensor<1x16x90x160xbf16>) attributes { + IfmOperands = [0 : index, 1 : index], + LayerName = "Mul_432", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + } + ], + OutputName = "Mul_432", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "double", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg7 = %arg5: tensor<1x16x90x160xbf16>, %arg8 = %arg6: tensor<1x16x90x160xbf16>) attributes { + LayerName = "Mul_432", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + } + ], + OutputName = "Mul_432", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + } + ], + Specializes = "MulBf16", + Traits = { + Binary = true, + Elementwise = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.dtype = "bfloat16", + config.num_kernel_iters = 0 : ui16 + }} { + %462 = tosa.mul %arg7, %arg8 { + LayerName = "Mul_432", + OutputName = "Mul_432", + shift = 0 : i8} : (tensor<1x16x90x160xbf16>, tensor<1x16x90x160xbf16>) -> tensor<1x16x90x160xbf16> loc(#loc295) + xten_nn.output %462 : tensor<1x16x90x160xbf16> loc(#loc295) + } -> tensor<1x16x90x160xbf16> 
loc(#loc295) + xten_nn.output %461 : tensor<1x16x90x160xbf16> loc(#loc295) + } -> tensor<1x16x90x160xbf16> loc(#loc295) + %443 = xten_nn.subgraph (%arg5 = %439: tensor<1x32x90x160xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Split_425_Duplicated#0", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 32, 90, 160]> : vector<4xindex> + } + ], + OutputName = "Split_425_Duplicated#0", + Overlay = "1x1_1x1_unspecifiedConnectivity", + Reason = "TemplatedGraph", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + } + ], + Specializes = "SliceHCWC8Adf", + With = { + config.aie_arch = "aie2p", + config.axis_letter = "C", + config.dim_c = 32 : ui32, + config.dim_h = 90 : ui32, + config.dim_w = 160 : ui32, + config.dtype = "bfloat16", + config.end = 16 : ui32, + config.num_ifm_shim_ch = 2 : ui32, + config.num_ofm_shim_ch = 2 : ui32, + config.start = 0 : ui32, + config.step = 1 : ui32 + }} { + %461 = tosa.slice %arg5 { + PartOfLayerName = "Split_425", + PartOfOutputName = "Split_425", + size = array<i64: 1, 16, 90, 160>, + start = array<i64: 0, 0, 0, 0>} : (tensor<1x32x90x160xbf16>) -> tensor<1x16x90x160xbf16> loc(#loc289) + xten_nn.output %461 : tensor<1x16x90x160xbf16> loc(#loc289) + } -> tensor<1x16x90x160xbf16> loc(#loc289) + %444 = xten_nn.subgraph (%arg5 = %443: tensor<1x16x90x160xbf16>, %arg6 = %arg1: tensor<1x16x90x160xbf16>) attributes { + IfmOperands = [0 : index, 1 : index], + LayerName = "Mul_426", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + } + ], + OutputName = "Mul_426", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "double", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg7 = %arg5: tensor<1x16x90x160xbf16>, %arg8 = %arg6: tensor<1x16x90x160xbf16>) attributes { + LayerName = "Mul_426", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + } + ], + OutputName = "Mul_426", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end
= dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + } + ], + Specializes = "MulBf16", + Traits = { + Binary = true, + Elementwise = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.dtype = "bfloat16", + config.num_kernel_iters = 0 : ui16 + }} { + %462 = tosa.mul %arg7, %arg8 { + LayerName = "Mul_426", + OutputName = "Mul_426", + shift = 0 : i8} : (tensor<1x16x90x160xbf16>, tensor<1x16x90x160xbf16>) -> tensor<1x16x90x160xbf16> loc(#loc290) + xten_nn.output %462 : tensor<1x16x90x160xbf16> loc(#loc290) + } -> tensor<1x16x90x160xbf16> loc(#loc290) + xten_nn.output %461 : tensor<1x16x90x160xbf16> loc(#loc290) + } -> tensor<1x16x90x160xbf16> loc(#loc290) + %445 = xten_nn.subgraph (%arg5 = %436: tensor<1x16x90x160xbf16>, %arg6 = %444: tensor<1x16x90x160xbf16>) attributes { + Axis = 1 : i32, + IfmOperands = [0 : index, 1 : index], + LayerName = "Concat_427", + Op = "Concat", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + } + ], + OutputName = "Concat_427", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "PseudoOp", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 32, 90, 160]> : vector<4xindex> + } + ], + current_data_format = "NCHW", + data_format = "HCWN"} { + %461 = tosa.concat %arg5, %arg6 { + LayerName = "Concat_427", + OutputName = "Concat_427", + axis = 1 : i32} : (tensor<1x16x90x160xbf16>, tensor<1x16x90x160xbf16>) -> tensor<1x32x90x160xbf16> loc(#loc291) + xten_nn.output %461 : tensor<1x32x90x160xbf16> loc(#loc291) + } -> tensor<1x32x90x160xbf16> loc(#loc291) + %446 = xten_nn.subgraph (%arg5 = %445: tensor<1x32x90x160xbf16>, %arg6 = %8: tensor<16x32x3x3xbf16>, %arg7 = %7: tensor<16xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Conv_428", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 32, 90, 160]> : vector<4xindex> + }, + { + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[16, 32, 3, 3]> : vector<4xindex> + }, + { + UnknownDataFormat = true + } + ], + OutputName = "Conv_428", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "double", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg8 = %arg5: tensor<1x32x90x160xbf16>, %arg9 = %arg6: tensor<16x32x3x3xbf16>, %arg10 = %arg7: tensor<16xbf16>) attributes { + Dilations = array<i64: 1, 1>, + HWPadding = [[1, 1], [1, 1]], +
LayerName = "Conv_428", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 32, 90, 160]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "wts_data", + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[16, 32, 3, 3]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Conv_428", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + } + ], + Specializes = "Conv2DBf16", + Traits = { + AllowDMAOptimization = true + }, + With = { + config.AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 = 1 : ui8, + config.act = 0 : ui8, + config.act_type = "RELU", + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.conv_type = [0 : ui8, 12 : ui8, 64 : ui8], + config.dtype_ifm = "bfloat16", + config.dtype_ofm = "bfloat16", + config.dtype_wts = "bfloat16", + config.ksize.height = 3 : ui8, + config.ksize.width = 3 : ui8, + config.lrelu_alpha = 1.000000e+00 : bf16, + config.lrelu_alpha_kernel = 1.000000e+00 : bf16, + config.stride_h = 1 : ui8, + config.stride_w = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %463 = "tosa.const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %464 = tosa.transpose %arg9, %463 : (tensor<16x32x3x3xbf16>, tensor<4xi32>) -> tensor<16x3x3x32xbf16> loc(#loc292) + %465 = tosa.transpose %arg8, %463 : (tensor<1x32x90x160xbf16>, tensor<4xi32>) -> tensor<1x90x160x32xbf16> loc(#loc292) + %466 = tosa.conv2d %465, %464, %arg10 { + PartOfLayerName = "Conv_428", + PartOfOutputName = "Conv_428", + dilation = array, + pad = array, + stride = array} : (tensor<1x90x160x32xbf16>, tensor<16x3x3x32xbf16>, tensor<16xbf16>) -> tensor<1x90x160x16xbf16> loc(#loc292) + %467 = tosa.transpose %466, %462 : (tensor<1x90x160x16xbf16>, tensor<4xi32>) -> tensor<1x16x90x160xbf16> loc(#loc292) + xten_nn.output %467 : tensor<1x16x90x160xbf16> loc(#loc292) + } -> tensor<1x16x90x160xbf16> loc(#loc292) + xten_nn.output %461 : tensor<1x16x90x160xbf16> loc(#loc292) + } -> tensor<1x16x90x160xbf16> loc(#loc292) + %447 = xten_nn.subgraph (%arg5 = %446: tensor<1x16x90x160xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Tanh_429", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + } + ], + OutputName = "Tanh_429", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "single", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x16x90x160xbf16>) attributes { + LayerName = "Tanh_429", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = 
"C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + } + ], + OutputName = "Tanh_429", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + } + ], + Specializes = "TanhTemplatedBf16", + Traits = { + Elementwise = true, + Unary = true + }, + With = { + config.ENABLE_FP16_AS_BF16 = 0 : ui8, + config.aie_arch = "aie2p", + config.compiler = "chess", + config.ifm_shift = 0 : si8, + config.num_kernel_iters = 0 : ui16, + config.ofm_shift = 0 : si8 + }} { + %462 = tosa.tanh %arg6 {LayerName = "Tanh_429", OutputName = "Tanh_429"} : (tensor<1x16x90x160xbf16>) -> tensor<1x16x90x160xbf16> loc(#loc293) + xten_nn.output %462 : tensor<1x16x90x160xbf16> loc(#loc293) + } -> tensor<1x16x90x160xbf16> loc(#loc293) + xten_nn.output %461 : tensor<1x16x90x160xbf16> loc(#loc293) + } -> tensor<1x16x90x160xbf16> loc(#loc293) + %448 = xten_nn.subgraph (%arg5 = %440: tensor<1x16x90x160xbf16>, %arg6 = %447: tensor<1x16x90x160xbf16>) attributes { + IfmOperands = [0 : index, 1 : index], + LayerName = "Mul_433", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + } + ], + OutputName = "Mul_433", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "double", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg7 = %arg5: tensor<1x16x90x160xbf16>, %arg8 = %arg6: tensor<1x16x90x160xbf16>) attributes { + LayerName = "Mul_433", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + } + ], + OutputName = "Mul_433", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + } + ], + Specializes = "MulBf16", + Traits = { + Binary = true, + Elementwise = true + }, + With = { + config.aie_arch = "aie2p", + config.compiler = "chess", + config.dtype = "bfloat16", + config.num_kernel_iters = 0 : ui16 + }} { + %462 = tosa.mul %arg7, %arg8 { + LayerName = "Mul_433", + OutputName = "Mul_433", + shift = 0 : i8} : (tensor<1x16x90x160xbf16>, tensor<1x16x90x160xbf16>) -> tensor<1x16x90x160xbf16> loc(#loc294) + xten_nn.output %462 : 
tensor<1x16x90x160xbf16> loc(#loc294) + } -> tensor<1x16x90x160xbf16> loc(#loc294) + xten_nn.output %461 : tensor<1x16x90x160xbf16> loc(#loc294) + } -> tensor<1x16x90x160xbf16> loc(#loc294) + %449 = xten_nn.subgraph (%arg5 = %442: tensor<1x16x90x160xbf16>, %arg6 = %448: tensor<1x16x90x160xbf16>) attributes { + IfmOperands = [0 : index, 1 : index], + LayerName = "Add_434", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + } + ], + OutputName = "Add_434", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "double", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg7 = %arg5: tensor<1x16x90x160xbf16>, %arg8 = %arg6: tensor<1x16x90x160xbf16>) attributes { + LayerName = "Add_434", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + } + ], + OutputName = "Add_434", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + } + ], + Specializes = "AddBf16", + Traits = { + Binary = true, + Elementwise = true + }, + With = { + config.act = 0 : ui8, + config.act_type = "LINEAR", + config.aie_arch = "aie2p", + config.compiler = "chess", + config.dtype = "bfloat16", + config.num_kernel_iters = 0 : ui16 + }} { + %462 = tosa.add %arg7, %arg8 {LayerName = "Add_434", OutputName = "Add_434"} : (tensor<1x16x90x160xbf16>, tensor<1x16x90x160xbf16>) -> tensor<1x16x90x160xbf16> loc(#loc296) + xten_nn.output %462 : tensor<1x16x90x160xbf16> loc(#loc296) + } -> tensor<1x16x90x160xbf16> loc(#loc296) + xten_nn.output %461 : tensor<1x16x90x160xbf16> loc(#loc296) + } -> tensor<1x16x90x160xbf16> loc(#loc296) + %450 = xten_nn.subgraph (%arg5 = %435: tensor<1x16x90x160xbf16>, %arg6 = %449: tensor<1x16x90x160xbf16>) attributes { + Axis = 1 : i32, + IfmOperands = [0 : index, 1 : index], + LayerName = "Concat_435", + Op = "Concat", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : 
vector<4xindex>, + l3_tile_count = dense<[1, 16, 90, 160]> : vector<4xindex> + } + ], + OutputName = "Concat_435", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "PseudoOp", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 32, 90, 160]> : vector<4xindex> + } + ], + current_data_format = "NCHW", + data_format = "HCWN"} { + %461 = tosa.concat %arg5, %arg6 { + LayerName = "Concat_435", + OutputName = "Concat_435", + axis = 1 : i32} : (tensor<1x16x90x160xbf16>, tensor<1x16x90x160xbf16>) -> tensor<1x32x90x160xbf16> loc(#loc297) + xten_nn.output %461 : tensor<1x32x90x160xbf16> loc(#loc297) + } -> tensor<1x32x90x160xbf16> loc(#loc297) + %451 = xten_nn.subgraph (%arg5 = %450: tensor<1x32x90x160xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Resize_437", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 32, 90, 160]> : vector<4xindex> + } + ], + OutputName = "Resize_437", + Overlay = "1x1_1x1_unspecifiedConnectivity", + Reason = "TemplatedGraph", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 32, 180, 320]> : vector<4xindex> + } + ], + Specializes = "ResizeAdf", + With = { + config.co_trans_mode = 1 : ui32, + config.dim_0 = 1 : ui32, + config.dim_1 = 32 : ui32, + config.dim_2 = 90 : ui32, + config.dim_3 = 160 : ui32, + config.dtype = "bfloat16", + config.mode = 1 : ui32, + config.nearest_mode = 0 : ui32, + config.num_ifm_shim_ch = 2 : ui32, + config.num_ofm_shim_ch = 2 : ui32, + config.output_H = 180 : ui32, + config.output_W = 320 : ui32 + }} { + %461 = xten_nn.resize %arg5 { + LayerName = "Resize_437", + OutputName = "Resize_437", + coordinate_transformation_mode = 1 : i64, + mode = 1 : i64, + nearest_mode = 0 : i64, + scales = array<f32: 1.000000e+00, 1.000000e+00, 2.000000e+00, 2.000000e+00>} : (tensor<1x32x90x160xbf16>) -> tensor<1x32x180x320xbf16> loc(#loc298) + xten_nn.output %461 : tensor<1x32x180x320xbf16> loc(#loc298) + } -> tensor<1x32x180x320xbf16> loc(#loc298) + %452 = xten_nn.subgraph (%arg5 = %451: tensor<1x32x180x320xbf16>, %arg6 = %166: tensor<1x3x180x320xbf16>) attributes { + Axis = 1 : i32, + IfmOperands = [0 : index, 1 : index], + LayerName = "Concat_438", + Op = "Concat", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm1", + l3_extend_end = dense<0> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 32, 180, 320]> : vector<4xindex> + }, + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm2", + l3_extend_end = dense<[0, 5, 0, 0]> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 3, 180, 320]> : vector<4xindex> + } + ], + OutputName = "Concat_438", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "PseudoOp", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<[0, 5, 0, 0]> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 35, 180, 320]>
: vector<4xindex> + } + ], + current_data_format = "NCHW", + data_format = "HCWN"} { + %461 = tosa.concat %arg5, %arg6 { + LayerName = "Concat_438", + OutputName = "Concat_438", + axis = 1 : i32} : (tensor<1x32x180x320xbf16>, tensor<1x3x180x320xbf16>) -> tensor<1x35x180x320xbf16> loc(#loc299) + xten_nn.output %461 : tensor<1x35x180x320xbf16> loc(#loc299) + } -> tensor<1x35x180x320xbf16> loc(#loc299) + %453 = xten_nn.subgraph (%arg5 = %452: tensor<1x35x180x320xbf16>, %arg6 = %5: tensor<16x35x3x3xbf16>, %arg7 = %4: tensor<16xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Conv_439", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<[0, 5, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 35, 180, 320]> : vector<4xindex> + }, + { + UnknownDataFormat = true, + l3_extend_end = dense<[0, 5, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[16, 35, 3, 3]> : vector<4xindex> + }, + { + UnknownDataFormat = true + } + ], + OutputName = "Relu_440", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 180, 320]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "double", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg8 = %arg5: tensor<1x35x180x320xbf16>, %arg9 = %arg6: tensor<16x35x3x3xbf16>, %arg10 = %arg7: tensor<16xbf16>) attributes { + Dilations = array, + HWPadding = [[1, 1], [1, 1]], + LayerName = "Conv_439", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<[0, 5, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 35, 180, 320]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "wts_data", + UnknownDataFormat = true, + l3_extend_end = dense<[0, 5, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[16, 35, 3, 3]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Relu_440", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 180, 320]> : vector<4xindex> + } + ], + Specializes = "Conv2DBf16", + Traits = { + AllowDMAOptimization = true, + NonNegativeOut = true + }, + With = { + config.AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 = 1 : ui8, + config.act = 1 : ui8, + config.act_type = "RELU", + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.conv_type = [0 : ui8, 12 : ui8, 64 : ui8], + config.dtype_ifm = "bfloat16", + config.dtype_ofm = "bfloat16", + config.dtype_wts = "bfloat16", + config.ksize.height = 3 : ui8, + config.ksize.width = 3 : ui8, + config.lrelu_alpha = 0.000000e+00 : bf16, + config.lrelu_alpha_kernel = 0.000000e+00 : bf16, + config.stride_h = 1 : ui8, + config.stride_w = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %463 = "tosa.const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %464 = tosa.transpose %arg9, %463 : (tensor<16x35x3x3xbf16>, tensor<4xi32>) -> tensor<16x3x3x35xbf16> loc(#loc352) + 
%465 = tosa.transpose %arg8, %463 : (tensor<1x35x180x320xbf16>, tensor<4xi32>) -> tensor<1x180x320x35xbf16> loc(#loc352) + %466 = tosa.conv2d %465, %464, %arg10 { + PartOfLayerName = "Conv_439", + PartOfOutputName = "Conv_439", + dilation = array, + pad = array, + stride = array} : (tensor<1x180x320x35xbf16>, tensor<16x3x3x35xbf16>, tensor<16xbf16>) -> tensor<1x180x320x16xbf16> loc(#loc300) + %467 = tosa.clamp %466 { + LayerName = "Relu_440", + OutputName = "Relu_440", + max_fp = 3.40282347E+38 : f32, + max_int = 2147483647 : i64, + min_fp = 0.000000e+00 : f32, + min_int = 0 : i64} : (tensor<1x180x320x16xbf16>) -> tensor<1x180x320x16xbf16> loc(#loc301) + %468 = tosa.transpose %467, %462 : (tensor<1x180x320x16xbf16>, tensor<4xi32>) -> tensor<1x16x180x320xbf16> loc(#loc352) + xten_nn.output %468 : tensor<1x16x180x320xbf16> loc(#loc301) + } -> tensor<1x16x180x320xbf16> loc(#loc352) + xten_nn.output %461 : tensor<1x16x180x320xbf16> loc(#loc352) + } -> tensor<1x16x180x320xbf16> loc(#loc352) + %454 = xten_nn.subgraph (%arg5 = %453: tensor<1x16x180x320xbf16>, %arg6 = %3: tensor<16x16x3x3xbf16>, %arg7 = %2: tensor<16xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Conv_441", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 180, 320]> : vector<4xindex> + }, + { + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[16, 16, 3, 3]> : vector<4xindex> + }, + { + UnknownDataFormat = true + } + ], + OutputName = "Relu_442", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 180, 320]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "double", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg8 = %arg5: tensor<1x16x180x320xbf16>, %arg9 = %arg6: tensor<16x16x3x3xbf16>, %arg10 = %arg7: tensor<16xbf16>) attributes { + Dilations = array, + HWPadding = [[1, 1], [1, 1]], + LayerName = "Conv_441", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 180, 320]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "wts_data", + UnknownDataFormat = true, + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[16, 16, 3, 3]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Relu_442", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 180, 320]> : vector<4xindex> + } + ], + Specializes = "Conv2DBf16", + Traits = { + AllowDMAOptimization = true, + NonNegativeOut = true + }, + With = { + config.AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 = 1 : ui8, + config.act = 1 : ui8, + config.act_type = "RELU", + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.conv_type = [0 : ui8, 12 : ui8, 64 : ui8], + config.dtype_ifm = "bfloat16", + config.dtype_ofm = "bfloat16", + config.dtype_wts = "bfloat16", 
+ config.ksize.height = 3 : ui8, + config.ksize.width = 3 : ui8, + config.lrelu_alpha = 0.000000e+00 : bf16, + config.lrelu_alpha_kernel = 0.000000e+00 : bf16, + config.stride_h = 1 : ui8, + config.stride_w = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %463 = "tosa.const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %464 = tosa.transpose %arg9, %463 : (tensor<16x16x3x3xbf16>, tensor<4xi32>) -> tensor<16x3x3x16xbf16> loc(#loc353) + %465 = tosa.transpose %arg8, %463 : (tensor<1x16x180x320xbf16>, tensor<4xi32>) -> tensor<1x180x320x16xbf16> loc(#loc353) + %466 = tosa.conv2d %465, %464, %arg10 { + PartOfLayerName = "Conv_441", + PartOfOutputName = "Conv_441", + dilation = array, + pad = array, + stride = array} : (tensor<1x180x320x16xbf16>, tensor<16x3x3x16xbf16>, tensor<16xbf16>) -> tensor<1x180x320x16xbf16> loc(#loc302) + %467 = tosa.clamp %466 { + LayerName = "Relu_442", + OutputName = "Relu_442", + max_fp = 3.40282347E+38 : f32, + max_int = 2147483647 : i64, + min_fp = 0.000000e+00 : f32, + min_int = 0 : i64} : (tensor<1x180x320x16xbf16>) -> tensor<1x180x320x16xbf16> loc(#loc303) + %468 = tosa.transpose %467, %462 : (tensor<1x180x320x16xbf16>, tensor<4xi32>) -> tensor<1x16x180x320xbf16> loc(#loc353) + xten_nn.output %468 : tensor<1x16x180x320xbf16> loc(#loc303) + } -> tensor<1x16x180x320xbf16> loc(#loc353) + xten_nn.output %461 : tensor<1x16x180x320xbf16> loc(#loc353) + } -> tensor<1x16x180x320xbf16> loc(#loc353) + %455 = xten_nn.subgraph (%arg5 = %454: tensor<1x16x180x320xbf16>, %arg6 = %1: tensor<4x16x1x1xbf16>, %arg7 = %0: tensor<4xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Conv_443", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 180, 320]> : vector<4xindex> + }, + { + UnknownDataFormat = true, + l3_extend_end = dense<[4, 0, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[4, 16, 1, 1]> : vector<4xindex> + }, + { + UnknownDataFormat = true + } + ], + OutputName = "Conv_443", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<[0, 4, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 4, 180, 320]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "double", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg8 = %arg5: tensor<1x16x180x320xbf16>, %arg9 = %arg6: tensor<4x16x1x1xbf16>, %arg10 = %arg7: tensor<4xbf16>) attributes { + Dilations = array, + HWPadding = [[0, 0], [0, 0]], + LayerName = "Conv_443", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 16, 180, 320]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "wts_data", + UnknownDataFormat = true, + l3_extend_end = dense<[4, 0, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[4, 16, 1, 1]> : vector<4xindex> + }, + { + Port = "data_io.wts", + SubPort = "bias", + UnknownDataFormat = true + } + ], + OutputName = "Conv_443", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + 
l3_extend_end = dense<[0, 4, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 4, 180, 320]> : vector<4xindex> + } + ], + Specializes = "Conv2DBf16", + Traits = { + AllowDMAOptimization = true + }, + With = { + config.AIE_API_EMULATE_BFLOAT16_MMUL_WITH_BFP16 = 1 : ui8, + config.act = 0 : ui8, + config.act_type = "RELU", + config.aie_arch = "aie2p", + config.batch_size = 1 : ui8, + config.compiler = "chess", + config.conv_type = [0 : ui8, 12 : ui8, 64 : ui8], + config.dtype_ifm = "bfloat16", + config.dtype_ofm = "bfloat16", + config.dtype_wts = "bfloat16", + config.ksize.height = 1 : ui8, + config.ksize.width = 1 : ui8, + config.lrelu_alpha = 1.000000e+00 : bf16, + config.lrelu_alpha_kernel = 1.000000e+00 : bf16, + config.stride_h = 1 : ui8, + config.stride_w = 1 : ui8 + }} { + %462 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc) + %463 = "tosa.const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> : () -> tensor<4xi32> loc(#loc304) + %464 = tosa.reshape %arg9 {new_shape = array} : (tensor<4x16x1x1xbf16>) -> tensor<4x1x1x16xbf16> loc(#loc304) + %465 = tosa.transpose %arg8, %463 : (tensor<1x16x180x320xbf16>, tensor<4xi32>) -> tensor<1x180x320x16xbf16> loc(#loc304) + %466 = tosa.conv2d %465, %464, %arg10 { + PartOfLayerName = "Conv_443", + PartOfOutputName = "Conv_443", + dilation = array, + pad = array, + stride = array} : (tensor<1x180x320x16xbf16>, tensor<4x1x1x16xbf16>, tensor<4xbf16>) -> tensor<1x180x320x4xbf16> loc(#loc304) + %467 = tosa.transpose %466, %462 : (tensor<1x180x320x4xbf16>, tensor<4xi32>) -> tensor<1x4x180x320xbf16> loc(#loc304) + xten_nn.output %467 : tensor<1x4x180x320xbf16> loc(#loc304) + } -> tensor<1x4x180x320xbf16> loc(#loc304) + xten_nn.output %461 : tensor<1x4x180x320xbf16> loc(#loc304) + } -> tensor<1x4x180x320xbf16> loc(#loc304) + %456 = xten_nn.subgraph (%arg5 = %455: tensor<1x4x180x320xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Split_444_Duplicated#1", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<[0, 4, 0, 0]> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 4, 180, 320]> : vector<4xindex> + } + ], + OutputName = "Split_444_Duplicated#1", + Overlay = "1x1_1x1_unspecifiedConnectivity", + Reason = "TemplatedGraph", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<[0, 7, 0, 0]> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 1, 180, 320]> : vector<4xindex> + } + ], + Specializes = "SliceHCWC8Adf", + With = { + config.aie_arch = "aie2p", + config.axis_letter = "C", + config.dim_c = 8 : ui32, + config.dim_h = 180 : ui32, + config.dim_w = 320 : ui32, + config.dtype = "bfloat16", + config.end = 4 : ui32, + config.num_ifm_shim_ch = 2 : ui32, + config.num_ofm_shim_ch = 2 : ui32, + config.start = 3 : ui32, + config.step = 1 : ui32 + }} { + %461 = tosa.slice %arg5 { + PartOfLayerName = "Split_444", + PartOfOutputName = "Split_444", + size = array, + start = array} : (tensor<1x4x180x320xbf16>) -> tensor<1x1x180x320xbf16> loc(#loc305) + xten_nn.output %461 : tensor<1x1x180x320xbf16> loc(#loc305) + } -> tensor<1x1x180x320xbf16> loc(#loc305) + %457 = xten_nn.subgraph (%arg5 = %456: tensor<1x1x180x320xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Clip_447", + Operands = [ + { 
+ CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<[0, 7, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 1, 180, 320]> : vector<4xindex> + } + ], + OutputName = "Clip_447", + Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight", + Reason = "InCoreChain", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + l3_extend_end = dense<[0, 7, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 1, 180, 320]> : vector<4xindex> + } + ], + memory_configuration = { + L1 = {layout = "strict"}, + L2 = {feature_maps_buffering = "double", layout = "flexible"} + }} { + %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x1x180x320xbf16>) attributes { + LayerName = "Clip_447", + Operands = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<[0, 7, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 1, 180, 320]> : vector<4xindex> + } + ], + OutputName = "Clip_447", + Reason = "MllibKernel", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<[0, 7, 0, 0]> : vector<4xindex>, + l3_tile_count = dense<[1, 1, 180, 320]> : vector<4xindex> + } + ], + Specializes = "ClipBf16", + Traits = { + Elementwise = true, + NonNegativeOut = true, + Unary = true + }, + With = { + config.aie_arch = "aie2p", + config.clamp_max = 1.000000e+00 : bf16, + config.clamp_min = 0.000000e+00 : bf16, + config.compiler = "chess", + config.ifm_shift = 0 : si8, + config.num_kernel_iters = 0 : ui16, + config.ofm_shift = 0 : si8 + }} { + %462 = tosa.clamp %arg6 { + LayerName = "Clip_447", + OutputName = "Clip_447", + max_fp = 1.000000e+00 : f32, + max_int = 1 : i64, + min_fp = 0.000000e+00 : f32, + min_int = 0 : i64} : (tensor<1x1x180x320xbf16>) -> tensor<1x1x180x320xbf16> loc(#loc307) + xten_nn.output %462 : tensor<1x1x180x320xbf16> loc(#loc307) + } -> tensor<1x1x180x320xbf16> loc(#loc307) + xten_nn.output %461 : tensor<1x1x180x320xbf16> loc(#loc307) + } -> tensor<1x1x180x320xbf16> loc(#loc307) + %458 = xten_nn.subgraph (%arg5 = %455: tensor<1x4x180x320xbf16>) attributes { + IfmOperands = [0 : index], + LayerName = "Split_444_Duplicated#0", + Operands = [ + { + CurrentDataFormat = "NCHW", + External = false, + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ifm", + l3_extend_end = dense<[0, 4, 0, 0]> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 4, 180, 320]> : vector<4xindex> + } + ], + OutputName = "Split_444_Duplicated#0", + Overlay = "1x1_1x1_unspecifiedConnectivity", + Reason = "TemplatedGraph", + Results = [ + { + CurrentDataFormat = "NCHW", + L3DataFormat = "HCWN", + L3Vectorization = "C:8", + Port = "data_io.ofm", + l3_extend_end = dense<[0, 5, 0, 0]> : vector<4xindex>, + l3_extend_start = dense<0> : vector<4xindex>, + l3_tile_count = dense<[1, 3, 180, 320]> : vector<4xindex> + } + ], + Specializes = "SliceHCWC8Adf", + With = { + config.aie_arch = "aie2p", + config.axis_letter = "C", + config.dim_c = 8 : ui32, + config.dim_h = 180 : ui32, + config.dim_w = 320 : ui32, + config.dtype = "bfloat16", + config.end = 3 : ui32, + config.num_ifm_shim_ch = 2 : ui32, + config.num_ofm_shim_ch = 2 : ui32, + config.start = 0 : ui32, + config.step = 1 : ui32 + }} { + %461 = tosa.slice %arg5 { + PartOfLayerName = "Split_444", + PartOfOutputName = "Split_444", + size = array, + start = array} : 
+      xten_nn.output %461 : tensor<1x3x180x320xbf16> loc(#loc305)
+    } -> tensor<1x3x180x320xbf16> loc(#loc305)
+    %459 = xten_nn.subgraph (%arg5 = %458: tensor<1x3x180x320xbf16>, %arg6 = %166: tensor<1x3x180x320xbf16>) attributes {
+      IfmOperands = [0 : index, 1 : index],
+      LayerName = "Add_445",
+      Operands = [
+        {
+          CurrentDataFormat = "NCHW",
+          L3DataFormat = "HCWN",
+          L3Vectorization = "C:8",
+          l3_extend_end = dense<[0, 5, 0, 0]> : vector<4xindex>,
+          l3_tile_count = dense<[1, 3, 180, 320]> : vector<4xindex>
+        },
+        {
+          CurrentDataFormat = "NCHW",
+          L3DataFormat = "HCWN",
+          L3Vectorization = "C:8",
+          l3_extend_end = dense<[0, 5, 0, 0]> : vector<4xindex>,
+          l3_tile_count = dense<[1, 3, 180, 320]> : vector<4xindex>
+        }
+      ],
+      OutputName = "Add_445",
+      Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight",
+      Reason = "InCoreChain",
+      Results = [
+        {
+          CurrentDataFormat = "NCHW",
+          L3DataFormat = "HCWN",
+          L3Vectorization = "C:8",
+          l3_extend_end = dense<[0, 5, 0, 0]> : vector<4xindex>,
+          l3_tile_count = dense<[1, 3, 180, 320]> : vector<4xindex>
+        }
+      ],
+      memory_configuration = {
+        L1 = {layout = "strict"},
+        L2 = {feature_maps_buffering = "double", layout = "flexible"}
+      }} {
+      %461 = xten_nn.subgraph (%arg7 = %arg5: tensor<1x3x180x320xbf16>, %arg8 = %arg6: tensor<1x3x180x320xbf16>) attributes {
+        LayerName = "Add_445",
+        Operands = [
+          {
+            CurrentDataFormat = "NCHW",
+            External = false,
+            L3DataFormat = "HCWN",
+            L3Vectorization = "C:8",
+            Port = "data_io.ifm1",
+            l3_extend_end = dense<[0, 5, 0, 0]> : vector<4xindex>,
+            l3_tile_count = dense<[1, 3, 180, 320]> : vector<4xindex>
+          },
+          {
+            CurrentDataFormat = "NCHW",
+            External = false,
+            L3DataFormat = "HCWN",
+            L3Vectorization = "C:8",
+            Port = "data_io.ifm2",
+            l3_extend_end = dense<[0, 5, 0, 0]> : vector<4xindex>,
+            l3_tile_count = dense<[1, 3, 180, 320]> : vector<4xindex>
+          }
+        ],
+        OutputName = "Add_445",
+        Reason = "MllibKernel",
+        Results = [
+          {
+            CurrentDataFormat = "NCHW",
+            L3DataFormat = "HCWN",
+            L3Vectorization = "C:8",
+            Port = "data_io.ofm",
+            l3_extend_end = dense<[0, 5, 0, 0]> : vector<4xindex>,
+            l3_tile_count = dense<[1, 3, 180, 320]> : vector<4xindex>
+          }
+        ],
+        Specializes = "AddBf16",
+        Traits = {
+          Binary = true,
+          Elementwise = true
+        },
+        With = {
+          config.act = 0 : ui8,
+          config.act_type = "LINEAR",
+          config.aie_arch = "aie2p",
+          config.compiler = "chess",
+          config.dtype = "bfloat16",
+          config.num_kernel_iters = 0 : ui16
+        }} {
+        %462 = tosa.add %arg7, %arg8 {LayerName = "Add_445", OutputName = "Add_445"} : (tensor<1x3x180x320xbf16>, tensor<1x3x180x320xbf16>) -> tensor<1x3x180x320xbf16> loc(#loc11)
+        xten_nn.output %462 : tensor<1x3x180x320xbf16> loc(#loc11)
+      } -> tensor<1x3x180x320xbf16> loc(#loc11)
+      xten_nn.output %461 : tensor<1x3x180x320xbf16> loc(#loc11)
+    } -> tensor<1x3x180x320xbf16> loc(#loc11)
+    %460 = xten_nn.subgraph (%arg5 = %459: tensor<1x3x180x320xbf16>) attributes {
+      IfmOperands = [0 : index],
+      LayerName = "Clip_446",
+      Operands = [
+        {
+          CurrentDataFormat = "NCHW",
+          L3DataFormat = "HCWN",
+          L3Vectorization = "C:8",
+          l3_extend_end = dense<[0, 5, 0, 0]> : vector<4xindex>,
+          l3_tile_count = dense<[1, 3, 180, 320]> : vector<4xindex>
+        }
+      ],
+      OutputName = "Clip_446",
+      Overlay = "4x4_1x4_vertBroadcastLeft_horizBroadcastRight",
+      Reason = "InCoreChain",
+      Results = [
+        {
+          CurrentDataFormat = "NCHW",
+          L3DataFormat = "HCWN",
+          L3Vectorization = "C:8",
+          l3_extend_end = dense<[0, 5, 0, 0]> : vector<4xindex>,
+          l3_tile_count = dense<[1, 3, 180, 320]> : vector<4xindex>
+        }
+      ],
+      memory_configuration = {
+        L1 = {layout = "strict"},
+        L2 = {feature_maps_buffering = "double", layout = "flexible"}
+      }} {
+      %461 = xten_nn.subgraph (%arg6 = %arg5: tensor<1x3x180x320xbf16>) attributes {
+        LayerName = "Clip_446",
+        Operands = [
+          {
+            CurrentDataFormat = "NCHW",
+            L3DataFormat = "HCWN",
+            L3Vectorization = "C:8",
+            Port = "data_io.ifm",
+            l3_extend_end = dense<[0, 5, 0, 0]> : vector<4xindex>,
+            l3_tile_count = dense<[1, 3, 180, 320]> : vector<4xindex>
+          }
+        ],
+        OutputName = "Clip_446",
+        Reason = "MllibKernel",
+        Results = [
+          {
+            CurrentDataFormat = "NCHW",
+            L3DataFormat = "HCWN",
+            L3Vectorization = "C:8",
+            Port = "data_io.ofm",
+            l3_extend_end = dense<[0, 5, 0, 0]> : vector<4xindex>,
+            l3_tile_count = dense<[1, 3, 180, 320]> : vector<4xindex>
+          }
+        ],
+        Specializes = "ClipBf16",
+        Traits = {
+          Elementwise = true,
+          NonNegativeOut = true,
+          Unary = true
+        },
+        With = {
+          config.aie_arch = "aie2p",
+          config.clamp_max = 1.000000e+00 : bf16,
+          config.clamp_min = 0.000000e+00 : bf16,
+          config.compiler = "chess",
+          config.ifm_shift = 0 : si8,
+          config.num_kernel_iters = 0 : ui16,
+          config.ofm_shift = 0 : si8
+        }} {
+        %462 = tosa.clamp %arg6 {
+          LayerName = "Clip_446",
+          OutputName = "Clip_446",
+          max_fp = 1.000000e+00 : f32,
+          max_int = 1 : i64,
+          min_fp = 0.000000e+00 : f32,
+          min_int = 0 : i64} : (tensor<1x3x180x320xbf16>) -> tensor<1x3x180x320xbf16> loc(#loc306)
+        xten_nn.output %462 : tensor<1x3x180x320xbf16> loc(#loc306)
+      } -> tensor<1x3x180x320xbf16> loc(#loc306)
+      xten_nn.output %461 : tensor<1x3x180x320xbf16> loc(#loc306)
+    } -> tensor<1x3x180x320xbf16> loc(#loc306)
+    return %449, %430, %410, %387, %460, %457 : tensor<1x16x90x160xbf16>, tensor<1x20x45x80xbf16>, tensor<1x40x23x40xbf16>, tensor<1x64x12x20xbf16>, tensor<1x3x180x320xbf16>, tensor<1x1x180x320xbf16> loc(#loc308)
+  } loc(#loc308)
+} loc(#loc)
+#loc1 = loc("Div_2")
+#loc2 = loc("Sub_431")
+#loc3 = loc("Sub_411")
+#loc4 = loc("Sub_385")
+#loc5 = loc("Sub_359")
+#loc6 = loc("Div_16")
+#loc7 = loc("Sub_14")
+#loc8 = loc("Initializer_398")
+#loc9 = loc("Slice_7")
+#loc10 = loc("CompilerGeneratedLoc")
+#loc11 = loc("Add_445")
+#loc12 = loc("AveragePool_346")
+#loc13 = loc("Conv_17")
+#loc14 = loc("Add_19")
+#loc15 = loc("Clip_22")
+#loc16 = loc("Div_24")
+#loc17 = loc("Mul_25")
+#loc18 = loc("Conv_26")
+#loc19 = loc("Relu_27")
+#loc20 = loc("Conv_28")
+#loc21 = loc("Add_29")
+#loc22 = loc("Conv_30")
+#loc23 = loc("Relu_31")
+#loc24 = loc("Conv_32")
+#loc25 = loc("Relu_33")
+#loc26 = loc("Conv_34")
+#loc27 = loc("Conv_35")
+#loc28 = loc("Relu_36")
+#loc29 = loc("Conv_37")
+#loc30 = loc("Relu_38")
+#loc31 = loc("Conv_39")
+#loc32 = loc("Add_40")
+#loc33 = loc("Conv_41")
+#loc34 = loc("Relu_42")
+#loc35 = loc("Conv_43")
+#loc36 = loc("Relu_44")
+#loc37 = loc("GlobalAveragePool_45")
+#loc38 = loc("Conv_46")
+#loc39 = loc("Relu_47")
+#loc40 = loc("Conv_48")
+#loc41 = loc("Add_50")
+#loc42 = loc("Clip_53")
+#loc43 = loc("Div_55")
+#loc44 = loc("Mul_56")
+#loc45 = loc("Conv_57")
+#loc46 = loc("Conv_58")
+#loc47 = loc("Relu_59")
+#loc48 = loc("Conv_60")
+#loc49 = loc("Relu_61")
+#loc50 = loc("GlobalAveragePool_62")
+#loc51 = loc("Conv_63")
+#loc52 = loc("Relu_64")
+#loc53 = loc("Conv_65")
+#loc54 = loc("Add_67")
+#loc55 = loc("Clip_70")
+#loc56 = loc("Div_72")
+#loc57 = loc("Mul_73")
+#loc58 = loc("Conv_74")
+#loc59 = loc("Add_75")
+#loc60 = loc("Conv_76")
+#loc61 = loc("Relu_77")
+#loc62 = loc("Conv_78")
+#loc63 = loc("Relu_79")
+#loc64 = loc("GlobalAveragePool_80")
+#loc65 = loc("Conv_81")
+#loc66 = loc("Relu_82")
+#loc67 = loc("Conv_83")
+#loc68 = loc("Add_85")
+#loc69 = loc("Clip_88")
+#loc70 = loc("Div_90")
+#loc71 = loc("Mul_91")
+#loc72 = loc("Conv_92")
+#loc73 = loc("Add_93")
+#loc74 = loc("Conv_94")
+#loc75 = loc("Add_96")
+#loc76 = loc("Clip_99")
+#loc77 = loc("Div_101")
+#loc78 = loc("Mul_102")
+#loc79 = loc("Conv_103")
+#loc80 = loc("Add_105")
+#loc81 = loc("Clip_108")
+#loc82 = loc("Div_110")
+#loc83 = loc("Mul_111")
+#loc84 = loc("Conv_112")
+#loc85 = loc("Conv_113")
+#loc86 = loc("Add_115")
+#loc87 = loc("Clip_118")
+#loc88 = loc("Div_120")
+#loc89 = loc("Mul_121")
+#loc90 = loc("Conv_122")
+#loc91 = loc("Add_124")
+#loc92 = loc("Clip_127")
+#loc93 = loc("Div_129")
+#loc94 = loc("Mul_130")
+#loc95 = loc("Conv_131")
+#loc96 = loc("Add_132")
+#loc97 = loc("Conv_133")
+#loc98 = loc("Add_135")
+#loc99 = loc("Clip_138")
+#loc100 = loc("Div_140")
+#loc101 = loc("Mul_141")
+#loc102 = loc("Conv_142")
+#loc103 = loc("Add_144")
+#loc104 = loc("Clip_147")
+#loc105 = loc("Div_149")
+#loc106 = loc("Mul_150")
+#loc107 = loc("Conv_151")
+#loc108 = loc("Add_152")
+#loc109 = loc("Conv_153")
+#loc110 = loc("Add_155")
+#loc111 = loc("Clip_158")
+#loc112 = loc("Div_160")
+#loc113 = loc("Mul_161")
+#loc114 = loc("Conv_162")
+#loc115 = loc("Add_164")
+#loc116 = loc("Clip_167")
+#loc117 = loc("Div_169")
+#loc118 = loc("Mul_170")
+#loc119 = loc("Conv_171")
+#loc120 = loc("Add_172")
+#loc121 = loc("Conv_173")
+#loc122 = loc("Add_175")
+#loc123 = loc("Clip_178")
+#loc124 = loc("Div_180")
+#loc125 = loc("Mul_181")
+#loc126 = loc("Conv_182")
+#loc127 = loc("Add_184")
+#loc128 = loc("Clip_187")
+#loc129 = loc("Div_189")
+#loc130 = loc("Mul_190")
+#loc131 = loc("GlobalAveragePool_191")
+#loc132 = loc("Conv_192")
+#loc133 = loc("Relu_193")
+#loc134 = loc("Conv_194")
+#loc135 = loc("Add_196")
+#loc136 = loc("Clip_199")
+#loc137 = loc("Div_201")
+#loc138 = loc("Mul_202")
+#loc139 = loc("Conv_203")
+#loc140 = loc("Conv_204")
+#loc141 = loc("Add_206")
+#loc142 = loc("Clip_209")
+#loc143 = loc("Div_211")
+#loc144 = loc("Mul_212")
+#loc145 = loc("Conv_213")
+#loc146 = loc("Add_215")
+#loc147 = loc("Clip_218")
+#loc148 = loc("Div_220")
+#loc149 = loc("Mul_221")
+#loc150 = loc("GlobalAveragePool_222")
+#loc151 = loc("Conv_223")
+#loc152 = loc("Relu_224")
+#loc153 = loc("Conv_225")
+#loc154 = loc("Add_227")
+#loc155 = loc("Clip_230")
+#loc156 = loc("Div_232")
+#loc157 = loc("Mul_233")
+#loc158 = loc("Conv_234")
+#loc159 = loc("Add_235")
+#loc160 = loc("Conv_236")
+#loc161 = loc("Add_238")
+#loc162 = loc("Clip_241")
+#loc163 = loc("Div_243")
+#loc164 = loc("Mul_244")
+#loc165 = loc("Conv_245")
+#loc166 = loc("Add_247")
+#loc167 = loc("Clip_250")
+#loc168 = loc("Div_252")
+#loc169 = loc("Mul_253")
+#loc170 = loc("GlobalAveragePool_254")
+#loc171 = loc("Conv_255")
+#loc172 = loc("Relu_256")
+#loc173 = loc("Conv_257")
+#loc174 = loc("Add_259")
+#loc175 = loc("Clip_262")
+#loc176 = loc("Div_264")
+#loc177 = loc("Mul_265")
+#loc178 = loc("Conv_266")
+#loc179 = loc("Conv_267")
+#loc180 = loc("Add_269")
+#loc181 = loc("Clip_272")
+#loc182 = loc("Div_274")
+#loc183 = loc("Mul_275")
+#loc184 = loc("Conv_276")
+#loc185 = loc("Add_278")
+#loc186 = loc("Clip_281")
+#loc187 = loc("Div_283")
+#loc188 = loc("Mul_284")
+#loc189 = loc("GlobalAveragePool_285")
+#loc190 = loc("Conv_286")
+#loc191 = loc("Relu_287")
+#loc192 = loc("Conv_288")
+#loc193 = loc("Add_290")
+#loc194 = loc("Clip_293")
+#loc195 = loc("Div_295")
+#loc196 = loc("Mul_296")
+#loc197 = loc("Conv_297")
+#loc198 = loc("Add_298")
+#loc199 = loc("Conv_299")
+#loc200 = loc("Add_301")
+#loc201 = loc("Clip_304")
+#loc202 = loc("Div_306")
+#loc203 = loc("Mul_307")
+#loc204 = loc("Conv_308")
+#loc205 = loc("Add_310")
+#loc206 = loc("Clip_313")
+#loc207 = loc("Div_315")
+#loc208 = loc("Mul_316")
+#loc209 = loc("GlobalAveragePool_317")
+#loc210 = loc("Conv_318")
+#loc211 = loc("Relu_319")
+#loc212 = loc("Conv_320")
+#loc213 = loc("Add_322")
+#loc214 = loc("Clip_325")
+#loc215 = loc("Div_327")
+#loc216 = loc("Mul_328")
+#loc217 = loc("Conv_329")
+#loc218 = loc("Add_330")
+#loc219 = loc("Conv_331")
+#loc220 = loc("Add_333")
+#loc221 = loc("Clip_336")
+#loc222 = loc("Div_338")
+#loc223 = loc("Mul_339")
+#loc224 = loc("GlobalAveragePool_342")
+#loc225 = loc("Conv_343")
+#loc226 = loc("Sigmoid_344")
+#loc227 = loc("Mul_345")
+#loc228 = loc("Conv_340")
+#loc229 = loc("Relu_341")
+#loc230 = loc("Split_349")
+#loc231 = loc("Concat_350")
+#loc232 = loc("Conv_351")
+#loc233 = loc("Sigmoid_352")
+#loc234 = loc("Split_353")
+#loc235 = loc("Mul_354")
+#loc236 = loc("Concat_355")
+#loc237 = loc("Conv_356")
+#loc238 = loc("Tanh_357")
+#loc239 = loc("Mul_361")
+#loc240 = loc("Mul_360")
+#loc241 = loc("Add_362")
+#loc242 = loc("Concat_363")
+#loc243 = loc("Resize_365")
+#loc244 = loc("Slice_371")
+#loc245 = loc("AveragePool_347")
+#loc246 = loc("AveragePool_348")
+#loc247 = loc("Concat_372")
+#loc248 = loc("Conv_373")
+#loc249 = loc("Relu_374")
+#loc250 = loc("Split_375")
+#loc251 = loc("Concat_376")
+#loc252 = loc("Conv_377")
+#loc253 = loc("Sigmoid_378")
+#loc254 = loc("Split_379")
+#loc255 = loc("Mul_380")
+#loc256 = loc("Concat_381")
+#loc257 = loc("Conv_382")
+#loc258 = loc("Tanh_383")
+#loc259 = loc("Mul_387")
+#loc260 = loc("Mul_386")
+#loc261 = loc("Add_388")
+#loc262 = loc("Concat_389")
+#loc263 = loc("Resize_391")
+#loc264 = loc("Slice_397")
+#loc265 = loc("Concat_398")
+#loc266 = loc("Conv_399")
+#loc267 = loc("Relu_400")
+#loc268 = loc("Split_401")
+#loc269 = loc("Concat_402")
+#loc270 = loc("Conv_403")
+#loc271 = loc("Sigmoid_404")
+#loc272 = loc("Split_405")
+#loc273 = loc("Mul_406")
+#loc274 = loc("Concat_407")
+#loc275 = loc("Conv_408")
+#loc276 = loc("Tanh_409")
+#loc277 = loc("Mul_413")
+#loc278 = loc("Mul_412")
+#loc279 = loc("Add_414")
+#loc280 = loc("Concat_415")
+#loc281 = loc("Resize_417")
+#loc282 = loc("Concat_418")
+#loc283 = loc("Conv_419")
+#loc284 = loc("Relu_420")
+#loc285 = loc("Split_421")
+#loc286 = loc("Concat_422")
+#loc287 = loc("Conv_423")
+#loc288 = loc("Sigmoid_424")
+#loc289 = loc("Split_425")
+#loc290 = loc("Mul_426")
+#loc291 = loc("Concat_427")
+#loc292 = loc("Conv_428")
+#loc293 = loc("Tanh_429")
+#loc294 = loc("Mul_433")
+#loc295 = loc("Mul_432")
+#loc296 = loc("Add_434")
+#loc297 = loc("Concat_435")
+#loc298 = loc("Resize_437")
+#loc299 = loc("Concat_438")
+#loc300 = loc("Conv_439")
+#loc301 = loc("Relu_440")
+#loc302 = loc("Conv_441")
+#loc303 = loc("Relu_442")
+#loc304 = loc("Conv_443")
+#loc305 = loc("Split_444")
+#loc306 = loc("Clip_446")
+#loc307 = loc("Clip_447")
+#loc308 = loc(fused[#loc1, #loc2, #loc3, #loc4, #loc5, #loc6, #loc7, #loc8, #loc9, #loc10, #loc11, #loc12, #loc13, #loc14, #loc15, #loc16, #loc17, #loc18, #loc19, #loc20, #loc21, #loc22, #loc23, #loc24, #loc25, #loc26, #loc27, #loc28, #loc29, #loc30, #loc31, #loc32, #loc33, #loc34, #loc35, #loc36, #loc37, #loc38, #loc39, #loc40, #loc41, #loc42, #loc43, #loc44, #loc45, #loc46, #loc47, #loc48, #loc49, #loc50, #loc51, #loc52, #loc53, #loc54, #loc55, #loc56, #loc57, #loc58, #loc59, #loc60, #loc61, #loc62, #loc63, #loc64, #loc65, #loc66, #loc67, #loc68, #loc69, #loc70, #loc71, #loc72, #loc73, #loc74, #loc75, #loc76, #loc77, #loc78, #loc79, #loc80, #loc81, #loc82, #loc83, #loc84, #loc85, #loc86, #loc87, #loc88, #loc89, #loc90, #loc91, #loc92, #loc93, #loc94, #loc95, #loc96, #loc97, #loc98, #loc99, #loc100, #loc101, #loc102, #loc103, #loc104, #loc105, #loc106, #loc107, #loc108, #loc109, #loc110, #loc111, #loc112, #loc113, #loc114, #loc115, #loc116, #loc117, #loc118, #loc119, #loc120, #loc121, #loc122, #loc123, #loc124, #loc125, #loc126, #loc127, #loc128, #loc129, #loc130, #loc131, #loc132, #loc133, #loc134, #loc135, #loc136, #loc137, #loc138, #loc139, #loc140, #loc141, #loc142, #loc143, #loc144, #loc145, #loc146, #loc147, #loc148, #loc149, #loc150, #loc151, #loc152, #loc153, #loc154, #loc155, #loc156, #loc157, #loc158, #loc159, #loc160, #loc161, #loc162, #loc163, #loc164, #loc165, #loc166, #loc167, #loc168, #loc169, #loc170, #loc171, #loc172, #loc173, #loc174, #loc175, #loc176, #loc177, #loc178, #loc179, #loc180, #loc181, #loc182, #loc183, #loc184, #loc185, #loc186, #loc187, #loc188, #loc189, #loc190, #loc191, #loc192, #loc193, #loc194, #loc195, #loc196, #loc197, #loc198, #loc199, #loc200, #loc201, #loc202, #loc203, #loc204, #loc205, #loc206, #loc207, #loc208, #loc209, #loc210, #loc211, #loc212, #loc213, #loc214, #loc215, #loc216, #loc217, #loc218, #loc219, #loc220, #loc221, #loc222, #loc223, #loc224, #loc225, #loc226, #loc227, #loc228, #loc229, #loc230, #loc231, #loc232, #loc233, #loc234, #loc235, #loc236, #loc237, #loc238, #loc239, #loc240, #loc241, #loc242, #loc243, #loc244, #loc245, #loc246, #loc247, #loc248, #loc249, #loc250, #loc251, #loc252, #loc253, #loc254, #loc255, #loc256, #loc257, #loc258, #loc259, #loc260, #loc261, #loc262, #loc263, #loc264, #loc265, #loc266, #loc267, #loc268, #loc269, #loc270, #loc271, #loc272, #loc273, #loc274, #loc275, #loc276, #loc277, #loc278, #loc279, #loc280, #loc281, #loc282, #loc283, #loc284, #loc285, #loc286, #loc287, #loc288, #loc289, #loc290, #loc291, #loc292, #loc293, #loc294, #loc295, #loc296, #loc297, #loc298, #loc299, #loc300, #loc301, #loc302, #loc303, #loc304, #loc305, #loc306, #loc307])
+#loc309 = loc(fused[#loc7, #loc8])
+#loc310 = loc(fused[#loc11, #loc9, #loc12])
+#loc311 = loc(fused[#loc9, #loc12, #loc11])
+#loc312 = loc(fused[#loc18, #loc19])
+#loc313 = loc(fused[#loc20, #loc21])
+#loc314 = loc(fused[#loc22, #loc23])
+#loc315 = loc(fused[#loc24, #loc25])
+#loc316 = loc(fused[#loc27, #loc28])
+#loc317 = loc(fused[#loc29, #loc30])
+#loc318 = loc(fused[#loc31, #loc32])
+#loc319 = loc(fused[#loc33, #loc34])
+#loc320 = loc(fused[#loc35, #loc36])
+#loc321 = loc(fused[#loc38, #loc39])
+#loc322 = loc(fused[#loc46, #loc47])
+#loc323 = loc(fused[#loc48, #loc49])
+#loc324 = loc(fused[#loc51, #loc52])
+#loc325 = loc(fused[#loc58, #loc59])
+#loc326 = loc(fused[#loc60, #loc61])
+#loc327 = loc(fused[#loc62, #loc63])
+#loc328 = loc(fused[#loc65, #loc66])
+#loc329 = loc(fused[#loc72, #loc73])
+#loc330 = loc(fused[#loc95, #loc96])
+#loc331 = loc(fused[#loc107, #loc108])
+#loc332 = loc(fused[#loc119, #loc120])
+#loc333 = loc(fused[#loc130, #loc131])
+#loc334 = loc(fused[#loc132, #loc133])
+#loc335 = loc(fused[#loc149, #loc150])
+#loc336 = loc(fused[#loc151, #loc152])
+#loc337 = loc(fused[#loc158, #loc159])
+#loc338 = loc(fused[#loc169, #loc170])
+#loc339 = loc(fused[#loc171, #loc172])
+#loc340 = loc(fused[#loc188, #loc189])
+#loc341 = loc(fused[#loc190, #loc191])
+#loc342 = loc(fused[#loc197, #loc198])
+#loc343 = loc(fused[#loc208, #loc209])
+#loc344 = loc(fused[#loc210, #loc211])
+#loc345 = loc(fused[#loc217, #loc218])
+#loc346 = loc(fused[#loc223, #loc224])
+#loc347 = loc(fused[#loc228, #loc229, #loc227])
+#loc348 = loc(fused[#loc228, #loc229])
+#loc349 = loc(fused[#loc248, #loc249])
+#loc350 = loc(fused[#loc266, #loc267])
+#loc351 = loc(fused[#loc283, #loc284])
+#loc352 = loc(fused[#loc300, #loc301])
+#loc353 = loc(fused[#loc302, #loc303])