[ASR] Add Squeezeformer model #2755

Merged · 9 commits · Mar 15, 2023
98 changes: 98 additions & 0 deletions examples/aishell/asr1/conf/chunk_squeezeformer.yaml
@@ -0,0 +1,98 @@
############################################
# Network Architecture #
############################################
cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: squeezeformer
encoder_conf:
    encoder_dim: 256    # dimension of attention
    output_size: 256    # dimension of output
    attention_heads: 4
    num_blocks: 12      # the number of encoder blocks
    reduce_idx: 5
    recover_idx: 11
    feed_forward_expansion_factor: 8
    input_dropout_rate: 0.1
    feed_forward_dropout_rate: 0.1
    attention_dropout_rate: 0.1
    adaptive_scale: true
    cnn_module_kernel: 31
    normalize_before: false
    activation_type: 'swish'
    pos_enc_layer_type: 'rel_pos'
    time_reduction_layer_type: 'stream'
    causal: true
    use_dynamic_chunk: true
    use_dynamic_left_chunk: false

# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1    # sublayer output dropout
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1    # label smoothing option
    length_normalized_loss: false
    init_type: 'kaiming_uniform'    # !Warning: needed for convergence

###########################################
# Data #
###########################################

train_manifest: data/manifest.train
dev_manifest: data/manifest.dev
test_manifest: data/manifest.test


###########################################
# Dataloader #
###########################################

vocab_filepath: data/lang_char/vocab.txt
spm_model_prefix: ''
unit_type: 'char'
preprocess_config: conf/preprocess.yaml
feat_dim: 80
stride_ms: 10.0
window_ms: 25.0
sortagrad: 0    # Feed samples from shortest to longest; -1: enabled for all epochs, 0: disabled, N: enabled for the first N epochs
batch_size: 32
maxlen_in: 512    # if input length > maxlen_in, batch size is automatically reduced
maxlen_out: 150   # if output length > maxlen_out, batch size is automatically reduced
minibatches: 0 # for debug
batch_count: auto
batch_bins: 0
batch_frames_in: 0
batch_frames_out: 0
batch_frames_inout: 0
num_workers: 2
subsampling_factor: 1
num_encs: 1

###########################################
# Training #
###########################################
n_epoch: 240
accum_grad: 1
global_grad_clip: 5.0
dist_sampler: True
optim: adam
optim_conf:
    lr: 0.001
    weight_decay: 1.0e-6
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 25000
    lr_decay: 1.0
log_interval: 100
checkpoint:
    kbest_n: 50
    latest_n: 5
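This config is the streaming variant of the recipe: causal: true keeps the convolution module from peeking ahead, time_reduction_layer_type: 'stream' selects the streamable time-reduction layer, and use_dynamic_chunk: true trains with randomly sampled chunk sizes so one model can decode at several latencies. A hedged sketch of the chunked attention pattern such training simulates (the real logic lives in PaddleSpeech's mask utilities; chunk_attention_mask is a hypothetical helper, not PR code):

import paddle

def chunk_attention_mask(n_frames: int, chunk_size: int) -> paddle.Tensor:
    # Boolean [n_frames, n_frames] mask: frame i may attend to frame j iff
    # j's chunk index is <= i's, i.e. full left context plus the current
    # chunk, and no look-ahead across chunk boundaries.
    chunk_idx = paddle.arange(n_frames) // chunk_size
    return chunk_idx.unsqueeze(1) >= chunk_idx.unsqueeze(0)

mask = chunk_attention_mask(8, chunk_size=4)  # two chunks of four frames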
93 changes: 93 additions & 0 deletions examples/aishell/asr1/conf/squeezeformer.yaml
@@ -0,0 +1,93 @@
############################################
# Network Architecture #
############################################
cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: squeezeformer
encoder_conf:
    encoder_dim: 256    # dimension of attention
    output_size: 256    # dimension of output
    attention_heads: 4
    num_blocks: 12      # the number of encoder blocks
    reduce_idx: 5
    recover_idx: 11
    feed_forward_expansion_factor: 8
    input_dropout_rate: 0.1
    feed_forward_dropout_rate: 0.1
    attention_dropout_rate: 0.1
    adaptive_scale: true
    cnn_module_kernel: 31
    normalize_before: false
    activation_type: 'swish'
    pos_enc_layer_type: 'rel_pos'
    time_reduction_layer_type: 'conv1d'

# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0

# hybrid CTC/attention
model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1    # label smoothing option
    length_normalized_loss: false
    init_type: 'kaiming_uniform'    # !Warning: needed for convergence

###########################################
# Data #
###########################################
train_manifest: data/manifest.train
dev_manifest: data/manifest.dev
test_manifest: data/manifest.test

###########################################
# Dataloader #
###########################################
vocab_filepath: data/lang_char/vocab.txt
spm_model_prefix: ''
unit_type: 'char'
preprocess_config: conf/preprocess.yaml
feat_dim: 80
stride_ms: 10.0
window_ms: 25.0
sortagrad: 0    # Feed samples from shortest to longest; -1: enabled for all epochs, 0: disabled, N: enabled for the first N epochs
batch_size: 32
maxlen_in: 512    # if input length > maxlen_in, batch size is automatically reduced
maxlen_out: 150   # if output length > maxlen_out, batch size is automatically reduced
minibatches: 0 # for debug
batch_count: auto
batch_bins: 0
batch_frames_in: 0
batch_frames_out: 0
batch_frames_inout: 0
num_workers: 2
subsampling_factor: 1
num_encs: 1

###########################################
# Training #
###########################################
n_epoch: 150
accum_grad: 8
global_grad_clip: 5.0
dist_sampler: False
optim: adam
optim_conf:
    lr: 0.002
    weight_decay: 1.0e-6
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 25000
    lr_decay: 1.0
log_interval: 100
checkpoint:
    kbest_n: 50
    latest_n: 5
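Compared with chunk_squeezeformer.yaml, this offline config drops the causal/dynamic-chunk flags, switches the time-reduction layer to conv1d, and uses different training knobs (150 epochs with accum_grad 8 and lr 0.002, versus 240 epochs with accum_grad 1 and lr 0.001). A quick sketch to surface every top-level difference, assuming PyYAML is installed and both files sit under conf/:

import yaml

# Load both recipe configs and print keys whose values differ.
with open('conf/chunk_squeezeformer.yaml') as f:
    streaming = yaml.safe_load(f)
with open('conf/squeezeformer.yaml') as f:
    offline = yaml.safe_load(f)

for key in sorted(set(streaming) | set(offline)):
    if streaming.get(key) != offline.get(key):
        print(f"{key}: streaming={streaming.get(key)!r} offline={offline.get(key)!r}")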
4 changes: 4 additions & 0 deletions paddlespeech/s2t/models/u2/u2.py
@@ -43,6 +43,7 @@
from paddlespeech.s2t.modules.decoder import BiTransformerDecoder
from paddlespeech.s2t.modules.decoder import TransformerDecoder
from paddlespeech.s2t.modules.encoder import ConformerEncoder
from paddlespeech.s2t.modules.encoder import SqueezeformerEncoder
from paddlespeech.s2t.modules.encoder import TransformerEncoder
from paddlespeech.s2t.modules.initializer import DefaultInitializerContext
from paddlespeech.s2t.modules.loss import LabelSmoothingLoss
@@ -905,6 +906,9 @@ def _init_from_config(cls, configs: dict):
        elif encoder_type == 'conformer':
            encoder = ConformerEncoder(
                input_dim, global_cmvn=global_cmvn, **configs['encoder_conf'])
        elif encoder_type == 'squeezeformer':
            encoder = SqueezeformerEncoder(
                input_dim, global_cmvn=global_cmvn, **configs['encoder_conf'])
        else:
            raise ValueError(f"unsupported encoder type: {encoder_type}")

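With the squeezeformer branch added above, the YAML drives encoder construction end to end. A minimal sketch of that wiring, assuming the conf/squeezeformer.yaml shown earlier (illustrative only, not the recipe's actual entry point):

import yaml
from paddlespeech.s2t.modules.encoder import SqueezeformerEncoder

with open('conf/squeezeformer.yaml') as f:
    configs = yaml.safe_load(f)

assert configs['encoder'] == 'squeezeformer'
# Mirrors U2Model._init_from_config: feat_dim is 80 in these configs, and
# global_cmvn stays None because cmvn_file is left empty.
encoder = SqueezeformerEncoder(
    configs['feat_dim'], global_cmvn=None, **configs['encoder_conf'])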
45 changes: 44 additions & 1 deletion paddlespeech/s2t/modules/attention.py
@@ -200,7 +200,12 @@ def forward(self,
class RelPositionMultiHeadedAttention(MultiHeadedAttention):
    """Multi-Head Attention layer with relative position encoding."""

-    def __init__(self, n_head, n_feat, dropout_rate):
+    def __init__(self,
+                 n_head,
+                 n_feat,
+                 dropout_rate,
+                 adaptive_scale=False,
+                 init_weights=False):
        """Construct an RelPositionMultiHeadedAttention object.
        Paper: https://arxiv.org/abs/1901.02860
        Args:
@@ -223,6 +228,39 @@ def __init__(self, n_head, n_feat, dropout_rate):
        pos_bias_v = self.create_parameter(
            (self.h, self.d_k), default_initializer=I.XavierUniform())
        self.add_parameter('pos_bias_v', pos_bias_v)
        self.adaptive_scale = adaptive_scale
        if self.adaptive_scale:
            ada_scale = self.create_parameter(
                [1, 1, n_feat], default_initializer=I.Constant(1.0))
            self.add_parameter('ada_scale', ada_scale)
            ada_bias = self.create_parameter(
                [1, 1, n_feat], default_initializer=I.Constant(0.0))
            self.add_parameter('ada_bias', ada_bias)
        if init_weights:
            self.init_weights()

    def init_weights(self):
        input_max = (self.h * self.d_k)**-0.5
        self.linear_q._param_attr = paddle.nn.initializer.Uniform(
            low=-input_max, high=input_max)
        self.linear_q._bias_attr = paddle.nn.initializer.Uniform(
            low=-input_max, high=input_max)
        self.linear_k._param_attr = paddle.nn.initializer.Uniform(
            low=-input_max, high=input_max)
        self.linear_k._bias_attr = paddle.nn.initializer.Uniform(
            low=-input_max, high=input_max)
        self.linear_v._param_attr = paddle.nn.initializer.Uniform(
            low=-input_max, high=input_max)
        self.linear_v._bias_attr = paddle.nn.initializer.Uniform(
            low=-input_max, high=input_max)
        self.linear_pos._param_attr = paddle.nn.initializer.Uniform(
            low=-input_max, high=input_max)
        self.linear_pos._bias_attr = paddle.nn.initializer.Uniform(
            low=-input_max, high=input_max)
        self.linear_out._param_attr = paddle.nn.initializer.Uniform(
            low=-input_max, high=input_max)
        self.linear_out._bias_attr = paddle.nn.initializer.Uniform(
            low=-input_max, high=input_max)

    def rel_shift(self, x, zero_triu: bool=False):
        """Compute relative positional encoding.
@@ -273,6 +311,11 @@ def forward(self,
                where `cache_t == chunk_size * num_decoding_left_chunks`
                and `head * d_k == size`
        """
        if self.adaptive_scale:
            query = self.ada_scale * query + self.ada_bias
            key = self.ada_scale * key + self.ada_bias
            value = self.ada_scale * value + self.ada_bias

        q, k, v = self.forward_qkv(query, key, value)
        # q = q.transpose([0, 2, 1, 3])  # (batch, time1, head, d_k)

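Two details above are worth spelling out. The init_weights bound input_max = (h * d_k)**-0.5 is simply 1/sqrt(fan_in) for the n_feat-wide projections. And the ada_scale/ada_bias pair of shape [1, 1, n_feat] is the Squeezeformer paper's adaptive scaling: a learnable per-feature affine applied to the module input, standing in for the pre-LayerNorm that these post-norm blocks no longer have. A self-contained sketch of that idea (an illustrative AdaptiveScale layer, not a class from this PR):

import paddle
from paddle.nn import initializer as I

class AdaptiveScale(paddle.nn.Layer):
    # Learnable per-feature affine: y = scale * x + bias, with scale
    # initialized to 1 and bias to 0 so training starts at the identity.
    def __init__(self, n_feat: int):
        super().__init__()
        self.scale = self.create_parameter(
            [1, 1, n_feat], default_initializer=I.Constant(1.0))
        self.bias = self.create_parameter(
            [1, 1, n_feat], default_initializer=I.Constant(0.0))

    def forward(self, x: paddle.Tensor) -> paddle.Tensor:
        return self.scale * x + self.bias

xs = paddle.randn([2, 10, 256])       # (batch, time, n_feat)
print(AdaptiveScale(256)(xs).shape)   # [2, 10, 256]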
42 changes: 41 additions & 1 deletion paddlespeech/s2t/modules/conformer_convolution.py
@@ -18,6 +18,7 @@

import paddle
from paddle import nn
from paddle.nn import initializer as I
from typeguard import check_argument_types

from paddlespeech.s2t.modules.align import BatchNorm1D
@@ -39,7 +40,9 @@ def __init__(self,
                 activation: nn.Layer=nn.ReLU(),
                 norm: str="batch_norm",
                 causal: bool=False,
-                 bias: bool=True):
+                 bias: bool=True,
+                 adaptive_scale: bool=False,
+                 init_weights: bool=False):
"""Construct an ConvolutionModule object.
Args:
channels (int): The number of channels of conv layers.
@@ -51,6 +54,18 @@
"""
assert check_argument_types()
super().__init__()
self.bias = bias
self.channels = channels
self.kernel_size = kernel_size
self.adaptive_scale = adaptive_scale
if self.adaptive_scale:
ada_scale = self.create_parameter(
[1, 1, channels], default_initializer=I.Constant(1.0))
self.add_parameter('ada_scale', ada_scale)
ada_bias = self.create_parameter(
[1, 1, channels], default_initializer=I.Constant(0.0))
self.add_parameter('ada_bias', ada_bias)

        self.pointwise_conv1 = Conv1D(
            channels,
            2 * channels,
@@ -105,6 +120,28 @@ def __init__(self,
        )
        self.activation = activation

        if init_weights:
            self.init_weights()

    def init_weights(self):
        pw_max = self.channels**-0.5
        dw_max = self.kernel_size**-0.5
        self.pointwise_conv1._param_attr = paddle.nn.initializer.Uniform(
            low=-pw_max, high=pw_max)
        if self.bias:
            self.pointwise_conv1._bias_attr = paddle.nn.initializer.Uniform(
                low=-pw_max, high=pw_max)
        self.depthwise_conv._param_attr = paddle.nn.initializer.Uniform(
            low=-dw_max, high=dw_max)
        if self.bias:
            self.depthwise_conv._bias_attr = paddle.nn.initializer.Uniform(
                low=-dw_max, high=dw_max)
        self.pointwise_conv2._param_attr = paddle.nn.initializer.Uniform(
            low=-pw_max, high=pw_max)
        if self.bias:
            self.pointwise_conv2._bias_attr = paddle.nn.initializer.Uniform(
                low=-pw_max, high=pw_max)

    def forward(
            self,
            x: paddle.Tensor,
@@ -123,6 +160,9 @@ def forward(
            paddle.Tensor: Output tensor (#batch, time, channels).
            paddle.Tensor: Output cache tensor (#batch, channels, time')
        """
        if self.adaptive_scale:
            x = self.ada_scale * x + self.ada_bias

        # exchange the temporal dimension and the feature dimension
        x = x.transpose([0, 2, 1])  # [B, C, T]

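The init_weights here follows the same 1/sqrt(fan_in) uniform scheme as the attention module: each pointwise conv sees `channels` inputs per output, while the depthwise conv sees kernel_size inputs per channel. A small sanity check of those bounds (plain arithmetic, not PR code):

import math

def uniform_init_bound(fan_in: int) -> float:
    # U(-1/sqrt(fan_in), 1/sqrt(fan_in)), matching pw_max and dw_max above.
    return fan_in ** -0.5

assert math.isclose(uniform_init_bound(256), 0.0625)            # pw_max for 256 channels
assert math.isclose(uniform_init_bound(31), 1 / math.sqrt(31))  # dw_max for kernel 31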