Operator: aten._softmax.default
cnt: 6, ((T([256, 8, 33, 33], f16), -1, False), {})
cnt: 6, ((T([256, 8, 31, 31], f16), -1, False), {})
cnt: 6, ((T([256, 8, 31, 33], f16), -1, False), {})
Operator: aten._softmax_backward_data.default
cnt: 6, ((T([256, 8, 31, 33], f16), T([256, 8, 31, 33], f16), -1, f16), {})
cnt: 6, ((T([256, 8, 31, 31], f16), T([256, 8, 31, 31], f16), -1, f16), {})
cnt: 6, ((T([256, 8, 33, 33], f16), T([256, 8, 33, 33], f16), -1, f16), {})
Operator: aten._to_copy.default
cnt: 1, ((T([1, 31, 31], f32),), {'dtype': torch.bool})
Operator: aten._unsafe_view.default
cnt: 36, ((T([8448, 512], f16), [256, 33, 512]), {})
cnt: 24, ((T([256, 8, 33, 64], f16), [2048, 33, 64]), {})
cnt: 12, ((T([256, 8, 64, 33], f16), [2048, 64, 33]), {})
cnt: 6, ((T([2048, 33, 33], f16), [256, 8, 33, 33]), {})
cnt: 6, ((T([2048, 33, 64], f16), [256, 8, 33, 64]), {})
cnt: 36, ((T([7936, 512], f16), [256, 31, 512]), {})
cnt: 30, ((T([256, 8, 31, 64], f16), [2048, 31, 64]), {})
cnt: 6, ((T([256, 8, 64, 31], f16), [2048, 64, 31]), {})
cnt: 6, ((T([2048, 31, 31], f16), [256, 8, 31, 31]), {})
cnt: 12, ((T([2048, 31, 64], f16), [256, 8, 31, 64]), {})
cnt: 6, ((T([2048, 31, 33], f16), [256, 8, 31, 33]), {})
cnt: 1, ((T([7936, 9521], f16), [256, 31, 9521]), {})
cnt: 18, ((T([256, 33, 8, 64], f16), [256, 33, 512]), {})
cnt: 12, ((T([256, 33, 512], f16), [8448, 512]), {})
cnt: 18, ((T([256, 31, 8, 64], f16), [256, 31, 512]), {})
cnt: 6, ((T([256, 31, 512], f16), [7936, 512]), {})
Operator: aten.add.Tensor
cnt: 1, ((T([256, 33, 512], f16), T([1, 33, 512], f16)), {})
cnt: 1, ((T([256, 31, 512], f16), T([1, 31, 512], f16)), {})
cnt: 30, ((T([256, 31, 512], f16), T([256, 31, 512], f16)), {})
cnt: 35, ((T([256, 33, 512], f16), T([256, 33, 512], f16)), {})
Operator: aten.add_.Tensor
cnt: 12, ((T([256, 33, 512], f16), T([256, 33, 512], f16)), {})
cnt: 18, ((T([256, 31, 512], f16), T([256, 31, 512], f16)), {})
Operator: aten.addmm.default
cnt: 6, ((T([2048], f16), T([8448, 512], f16), T([512, 2048], f16, stride=(1, 512))), {})
cnt: 6, ((T([512], f16), T([8448, 2048], f16), T([2048, 512], f16, stride=(1, 2048))), {})
cnt: 6, ((T([2048], f16), T([7936, 512], f16), T([512, 2048], f16, stride=(1, 512))), {})
cnt: 6, ((T([512], f16), T([7936, 2048], f16), T([2048, 512], f16, stride=(1, 2048))), {})
Operator: aten.bitwise_and.Tensor
cnt: 1, ((T([256, 1, 31], b8, stride=(1, 7936, 256)), T([1, 31, 31], b8)), {})
Operator: aten.bmm.default
cnt: 6, ((T([2048, 33, 64], f16), T([2048, 64, 33], f16)), {})
cnt: 6, ((T([2048, 33, 33], f16), T([2048, 33, 64], f16)), {})
cnt: 6, ((T([2048, 31, 64], f16), T([2048, 64, 31], f16)), {})
cnt: 6, ((T([2048, 31, 31], f16), T([2048, 31, 64], f16)), {})
cnt: 6, ((T([2048, 31, 64], f16), T([2048, 64, 33], f16)), {})
cnt: 6, ((T([2048, 31, 33], f16), T([2048, 33, 64], f16)), {})
cnt: 6, ((T([2048, 33, 31], f16, stride=(1023, 1, 33)), T([2048, 31, 64], f16)), {})
cnt: 6, ((T([2048, 31, 64], f16), T([2048, 64, 33], f16, stride=(2112, 1, 64))), {})
cnt: 6, ((T([2048, 64, 31], f16, stride=(1984, 1, 64)), T([2048, 31, 33], f16)), {})
cnt: 6, ((T([2048, 31, 33], f16), T([2048, 33, 64], f16, stride=(2112, 1, 33))), {})
cnt: 6, ((T([2048, 31, 31], f16, stride=(961, 1, 31)), T([2048, 31, 64], f16)), {})
cnt: 6, ((T([2048, 31, 64], f16), T([2048, 64, 31], f16, stride=(1984, 1, 64))), {})
cnt: 6, ((T([2048, 64, 31], f16, stride=(1984, 1, 64)), T([2048, 31, 31], f16)), {})
cnt: 6, ((T([2048, 31, 31], f16), T([2048, 31, 64], f16, stride=(1984, 1, 31))), {})
cnt: 6, ((T([2048, 33, 33], f16, stride=(1089, 1, 33)), T([2048, 33, 64], f16)), {})
cnt: 6, ((T([2048, 33, 64], f16), T([2048, 64, 33], f16, stride=(2112, 1, 64))), {})
cnt: 6, ((T([2048, 64, 33], f16, stride=(2112, 1, 64)), T([2048, 33, 33], f16)), {})
cnt: 6, ((T([2048, 33, 33], f16), T([2048, 33, 64], f16, stride=(2112, 1, 33))), {})
Operator: aten.clone.default
cnt: 1, ((T([256, 33], i64, stride=(1, 256)),), {})
cnt: 1, ((T([256, 31], i64, stride=(1, 256)),), {})
cnt: 1, ((T([1, 33, 512], f16),), {})
cnt: 1, ((T([1, 31, 512], f16),), {})
Operator: aten.copy_.default
cnt: 1, ((T([256, 33], i64, stride=(1, 256)), T([256, 33], i64, stride=(1, 256))), {})
cnt: 1, ((T([256, 31], i64, stride=(1, 256)), T([256, 31], i64, stride=(1, 256))), {})
cnt: 12, ((T([256, 31, 512], f16), T([256, 31, 512], f16)), {})
cnt: 6, ((T([7936, 512], f16), T([7936, 512], f16)), {})
cnt: 12, ((T([256, 33, 512], f16), T([256, 33, 512], f16)), {})
cnt: 6, ((T([8448, 512], f16), T([8448, 512], f16)), {})
Operator: aten.div.Tensor
cnt: 6, ((T([256, 8, 33, 64], f16, stride=(16896, 64, 512, 1)), 8.0), {})
cnt: 12, ((T([256, 8, 31, 64], f16, stride=(15872, 64, 512, 1)), 8.0), {})
cnt: 2, ((T([], f16), 75558656), {})
cnt: 12, ((T([256, 8, 31, 64], f16), 8.0), {})
cnt: 6, ((T([256, 8, 33, 64], f16), 8.0), {})
Operator: aten.embedding.default
cnt: 1, ((T([9521, 512], f16), T([256, 33], i64, stride=(1, 256)), 1), {})
cnt: 1, ((T([9521, 512], f16), T([256, 31], i64, stride=(1, 256)), 1), {})
Operator: aten.embedding_dense_backward.default
cnt: 1, ((T([256, 31, 512], f16), T([256, 31], i64, stride=(1, 256)), 9521, 1, False), {})
cnt: 1, ((T([256, 33, 512], f16), T([256, 33], i64, stride=(1, 256)), 9521, 1, False), {})
Operator: aten.eq.Scalar
cnt: 12, ((T([256, 1, 1, 33], b8, stride=(1, 8448, 8448, 256)), 0), {})
cnt: 6, ((T([256, 1, 31, 31], b8, stride=(1, 7936, 256, 7936)), 0), {})
Operator: aten.masked_fill.Scalar
cnt: 6, ((T([256, 8, 33, 33], f16), T([256, 1, 1, 33], b8, stride=(1, 8448, 8448, 256)), -65504.0), {})
cnt: 6, ((T([256, 8, 31, 31], f16), T([256, 1, 31, 31], b8, stride=(1, 7936, 256, 7936)), -65504.0), {})
cnt: 6, ((T([256, 8, 31, 33], f16), T([256, 1, 1, 33], b8, stride=(1, 8448, 8448, 256)), -65504.0), {})
cnt: 6, ((T([256, 8, 31, 33], f16), T([256, 1, 1, 33], b8, stride=(1, 8448, 8448, 256)), 0), {})
cnt: 6, ((T([256, 8, 31, 31], f16), T([256, 1, 31, 31], b8, stride=(1, 7936, 256, 7936)), 0), {})
cnt: 6, ((T([256, 8, 33, 33], f16), T([256, 1, 1, 33], b8, stride=(1, 8448, 8448, 256)), 0), {})
Operator: aten.mm.default
cnt: 36, ((T([8448, 512], f16), T([512, 512], f16, stride=(1, 512))), {})
cnt: 36, ((T([7936, 512], f16), T([512, 512], f16, stride=(1, 512))), {})
cnt: 1, ((T([7936, 512], f16), T([512, 9521], f16, stride=(1, 512))), {})
cnt: 1, ((T([9521, 7936], f16, stride=(1, 9521)), T([7936, 512], f16)), {})
cnt: 1, ((T([7936, 9521], f16), T([9521, 512], f16)), {})
cnt: 6, ((T([7936, 512], f16), T([512, 2048], f16)), {})
cnt: 6, ((T([512, 7936], f16, stride=(1, 512)), T([7936, 2048], f16)), {})
cnt: 6, ((T([7936, 2048], f16), T([2048, 512], f16)), {})
cnt: 6, ((T([2048, 7936], f16, stride=(1, 2048)), T([7936, 512], f16)), {})
cnt: 36, ((T([512, 7936], f16, stride=(1, 512)), T([7936, 512], f16)), {})
cnt: 36, ((T([7936, 512], f16), T([512, 512], f16)), {})
cnt: 36, ((T([512, 8448], f16, stride=(1, 512)), T([8448, 512], f16)), {})
cnt: 36, ((T([8448, 512], f16), T([512, 512], f16)), {})
cnt: 6, ((T([8448, 512], f16), T([512, 2048], f16)), {})
cnt: 6, ((T([512, 8448], f16, stride=(1, 512)), T([8448, 2048], f16)), {})
cnt: 6, ((T([8448, 2048], f16), T([2048, 512], f16)), {})
cnt: 6, ((T([2048, 8448], f16, stride=(1, 2048)), T([8448, 512], f16)), {})
Operator: aten.mul.Tensor
cnt: 1, ((T([256, 31, 9521], f16), 1.0), {})
cnt: 1, ((T([256, 31, 9521], f16, stride=(0, 0, 0)), 1.0), {})
Operator: aten.native_layer_norm.default
cnt: 13, ((T([256, 33, 512], f16), [512], T([512], f16), T([512], f16), 1e-06), {})
cnt: 19, ((T([256, 31, 512], f16), [512], T([512], f16), T([512], f16), 1e-06), {})
Operator: aten.native_layer_norm_backward.default
cnt: 19, ((T([256, 31, 512], f16), T([256, 31, 512], f16), [512], T([256, 31, 1], f32), T([256, 31, 1], f32), T([512], f16), T([512], f16), [True, True, True]), {})
cnt: 13, ((T([256, 33, 512], f16), T([256, 33, 512], f16), [512], T([256, 33, 1], f32), T([256, 33, 1], f32), T([512], f16), T([512], f16), [True, True, True]), {})
Operator: aten.ne.Scalar
cnt: 1, ((T([256, 33], i64, stride=(1, 256)), 1), {})
cnt: 1, ((T([256, 31], i64, stride=(1, 256)), 1), {})
Operator: aten.new_empty_strided.default
cnt: 6, ((T([7936, 512], f16), [7936, 512], [512, 1]), {})
cnt: 6, ((T([8448, 512], f16), [8448, 512], [512, 1]), {})
Operator: aten.new_zeros.default
cnt: 6, ((T([256, 31, 512], f16), [4063232]), {})
cnt: 6, ((T([256, 33, 512], f16), [4325376]), {})
Operator: aten.relu.default
cnt: 6, ((T([256, 33, 2048], f16),), {})
cnt: 6, ((T([256, 31, 2048], f16),), {})
Operator: aten.rsub.Scalar
cnt: 1, ((T([1, 31, 31], f32), 1), {})
Operator: aten.sum.SymInt
cnt: 6, ((T([7936, 512], f16), [0], True), {})
cnt: 6, ((T([7936, 2048], f16), [0], True), {})
cnt: 6, ((T([8448, 512], f16), [0], True), {})
cnt: 6, ((T([8448, 2048], f16), [0], True), {})
Operator: aten.sum.default
cnt: 1, ((T([7936, 9521], f16),), {})
Operator: aten.threshold_backward.default
cnt: 6, ((T([256, 31, 2048], f16), T([256, 31, 2048], f16), 0), {})
cnt: 6, ((T([256, 33, 2048], f16), T([256, 33, 2048], f16), 0), {})
Operator: aten.triu.default
cnt: 1, ((T([1, 31, 31], f32), 1), {})
