
[TTS] Add DiffSinger with Opencpop dataset #3005

Merged: 57 commits merged into PaddlePaddle:develop on Mar 13, 2023. Changes shown below are from 50 commits.

Commits
f58de66
updata readme, test=doc
lym0302 Aug 26, 2022
0251c38
Merge branch 'PaddlePaddle:develop' into develop
lym0302 Aug 29, 2022
034aef5
Merge branch 'PaddlePaddle:develop' into develop
lym0302 Sep 6, 2022
ccce14f
Merge branch 'PaddlePaddle:develop' into develop
lym0302 Sep 14, 2022
2244b53
Merge branch 'PaddlePaddle:develop' into develop
lym0302 Sep 15, 2022
5c197e7
Merge branch 'PaddlePaddle:develop' into develop
lym0302 Sep 20, 2022
8e5e265
update yaml and readme, test=tts
lym0302 Sep 20, 2022
6b4cccb
Merge branch 'PaddlePaddle:develop' into develop
lym0302 Sep 20, 2022
697e1f7
Merge branch 'PaddlePaddle:develop' into develop
lym0302 Sep 26, 2022
f6cf18e
fix batch_size, test=tts
lym0302 Sep 26, 2022
20ccc05
Merge branch 'PaddlePaddle:develop' into develop
lym0302 Sep 27, 2022
c737dab
Merge branch 'PaddlePaddle:develop' into develop
lym0302 Sep 30, 2022
8dc3c98
Merge branch 'PaddlePaddle:develop' into develop
lym0302 Oct 8, 2022
fa434cb
Merge branch 'PaddlePaddle:develop' into develop
lym0302 Oct 12, 2022
2b9d7c8
Merge branch 'PaddlePaddle:develop' into develop
lym0302 Oct 17, 2022
8164d86
Merge branch 'PaddlePaddle:develop' into develop
lym0302 Oct 20, 2022
8964190
Merge branch 'PaddlePaddle:develop' into develop
lym0302 Oct 27, 2022
06383d5
Merge branch 'PaddlePaddle:develop' into develop
lym0302 Oct 27, 2022
2a978bc
Merge branch 'PaddlePaddle:develop' into develop
lym0302 Nov 1, 2022
664aed4
Merge branch 'PaddlePaddle:develop' into develop
lym0302 Nov 2, 2022
003ff8f
update readme, test=doc
lym0302 Nov 4, 2022
d3eb589
Merge branch 'develop' of https://github.com/lym0302/PaddleSpeech int…
lym0302 Nov 4, 2022
dc71ad0
chmod, test=tts
lym0302 Nov 14, 2022
8457159
Merge branch 'develop' of https://github.com/lym0302/PaddleSpeech int…
lym0302 Nov 14, 2022
eef87bb
Merge branch 'PaddlePaddle:develop' into develop
lym0302 Nov 14, 2022
2e5af47
Merge branch 'PaddlePaddle:develop' into develop
lym0302 Nov 14, 2022
5c67d95
Merge branch 'develop' of https://github.com/lym0302/PaddleSpeech int…
lym0302 Nov 14, 2022
4d8ef8c
Merge branch 'PaddlePaddle:develop' into develop
lym0302 Nov 16, 2022
152ebcb
Merge branch 'PaddlePaddle:develop' into develop
lym0302 Nov 29, 2022
5c8b75e
Merge branch 'PaddlePaddle:develop' into develop
lym0302 Dec 4, 2022
700e281
Merge branch 'PaddlePaddle:develop' into develop
lym0302 Dec 28, 2022
bfae0be
add multi-spk static model infer, test=tts
lym0302 Dec 28, 2022
7ad91d6
Merge branch 'PaddlePaddle:develop' into develop
lym0302 Jan 11, 2023
82378e5
Merge branch 'PaddlePaddle:develop' into develop
lym0302 Jan 15, 2023
c463b35
diffsinger opencpop fft train, test=tts
lym0302 Jan 16, 2023
6fb281c
fix pitch_mask
lym0302 Jan 16, 2023
ef7d15d
base diffsinger, test=tts
lym0302 Feb 1, 2023
c91dc02
fix diffsinger, test=tts
lym0302 Feb 3, 2023
84a22ff
diffsinger_tmp
lym0302 Feb 9, 2023
def9d64
fix diffsinger loss target to noisy_mel
HighCWu Feb 9, 2023
9e8bd9f
Merge pull request #3 from HighCWu/diffsinger_tmp
lym0302 Feb 9, 2023
8a4b18c
add test.jsonl
lym0302 Feb 10, 2023
ffe44b8
Merge branch 'diffsinger_tmp' of https://github.com/lym0302/PaddleSpe…
lym0302 Feb 10, 2023
4ecc752
fix eval
lym0302 Feb 10, 2023
9df1294
add linear norm
lym0302 Feb 14, 2023
d1173b9
fix
lym0302 Feb 14, 2023
d7928d7
update diffsinger, test=tts
lym0302 Feb 22, 2023
1d1e859
diffsinger, test=tts
lym0302 Mar 7, 2023
f71f481
solve conflict
lym0302 Mar 7, 2023
3df69e7
update inference step
lym0302 Mar 8, 2023
9acc852
fix comment
lym0302 Mar 9, 2023
c9c6960
fix inference
lym0302 Mar 9, 2023
bd47de8
update
lym0302 Mar 10, 2023
72d9c63
remove test.jsonl
lym0302 Mar 10, 2023
9b34070
add readme
lym0302 Mar 13, 2023
4e4609f
astype
lym0302 Mar 13, 2023
b86b4db
update voc path
lym0302 Mar 13, 2023
156 changes: 156 additions & 0 deletions examples/opencpop/svs1/conf/default.yaml
@@ -0,0 +1,156 @@
###########################################################
# FEATURE EXTRACTION SETTING #
###########################################################

fs: 24000          # sampling rate
n_fft: 512         # FFT size (samples).
n_shift: 128       # Hop size (samples). ~5.3 ms at 24 kHz
win_length: 512    # Window length (samples). ~21.3 ms at 24 kHz
                   # If set to null, it will be the same as fft_size.
window: "hann"     # Window function.

# Only used for feats_type != raw

fmin: 30 # Minimum frequency of Mel basis.
fmax: 12000 # Maximum frequency of Mel basis.
n_mels: 80 # The number of mel basis.

# Only used for the model using pitch features (e.g. FastSpeech2)
f0min: 80 # Minimum f0 for pitch extraction.
f0max: 750 # Maximum f0 for pitch extraction.


###########################################################
# DATA SETTING #
###########################################################
batch_size: 48 # batch size
num_workers: 1 # number of data loader workers


###########################################################
# MODEL SETTING #
###########################################################
model:
    # music score related
    note_num: 300                                   # number of notes
    is_slur_num: 2                                  # number of slur states

    # fastspeech2 module
    fastspeech2_params:
        adim: 256                                   # attention dimension
        aheads: 2                                   # number of attention heads
        elayers: 4                                  # number of encoder layers
        eunits: 1024                                # number of encoder ff units
        dlayers: 4                                  # number of decoder layers
        dunits: 1024                                # number of decoder ff units
        positionwise_layer_type: conv1d-linear      # type of position-wise layer
        positionwise_conv_kernel_size: 9            # kernel size of position-wise conv layer
        transformer_enc_dropout_rate: 0.1           # dropout rate for transformer encoder layer
        transformer_enc_positional_dropout_rate: 0.1 # dropout rate for transformer encoder positional encoding
        transformer_enc_attn_dropout_rate: 0.0      # dropout rate for transformer encoder attention layer
        transformer_activation_type: "gelu"         # activation function type in transformer
        encoder_normalize_before: True              # whether to perform layer normalization before encoder input
        decoder_normalize_before: True              # whether to perform layer normalization before decoder input
        reduction_factor: 1                         # reduction factor
        init_type: xavier_uniform                   # initialization type
        init_enc_alpha: 1.0                         # initial value of alpha of encoder scaled position encoding
        init_dec_alpha: 1.0                         # initial value of alpha of decoder scaled position encoding
        use_scaled_pos_enc: True                    # whether to use scaled positional encoding
        transformer_dec_dropout_rate: 0.1           # dropout rate for transformer decoder layer
        transformer_dec_positional_dropout_rate: 0.1 # dropout rate for transformer decoder positional encoding
        transformer_dec_attn_dropout_rate: 0.0      # dropout rate for transformer decoder attention layer
        duration_predictor_layers: 5                # number of layers of duration predictor
        duration_predictor_chans: 256               # number of channels of duration predictor
        duration_predictor_kernel_size: 3           # filter size of duration predictor
        duration_predictor_dropout_rate: 0.5        # dropout rate in duration predictor
        pitch_predictor_layers: 5                   # number of conv layers in pitch predictor
        pitch_predictor_chans: 256                  # number of channels of conv layers in pitch predictor
        pitch_predictor_kernel_size: 5              # kernel size of conv layers in pitch predictor
        pitch_predictor_dropout: 0.5                # dropout rate in pitch predictor
        pitch_embed_kernel_size: 1                  # kernel size of conv embedding layer for pitch
        pitch_embed_dropout: 0.0                    # dropout rate after conv embedding layer for pitch
        stop_gradient_from_pitch_predictor: True    # whether to stop the gradient from pitch predictor to encoder
        energy_predictor_layers: 2                  # number of conv layers in energy predictor
        energy_predictor_chans: 256                 # number of channels of conv layers in energy predictor
        energy_predictor_kernel_size: 3             # kernel size of conv layers in energy predictor
        energy_predictor_dropout: 0.5               # dropout rate in energy predictor
        energy_embed_kernel_size: 1                 # kernel size of conv embedding layer for energy
        energy_embed_dropout: 0.0                   # dropout rate after conv embedding layer for energy
        stop_gradient_from_energy_predictor: False  # whether to stop the gradient from energy predictor to encoder
        postnet_layers: 5                           # number of layers of postnet
        postnet_filts: 5                            # filter size of conv layers in postnet
        postnet_chans: 256                          # number of channels of conv layers in postnet
        postnet_dropout_rate: 0.5                   # dropout rate for postnet

    # denoiser module
    denoiser_params:
        in_channels: 80                             # number of channels of the input mel-spectrogram
        out_channels: 80                            # number of channels of the output mel-spectrogram
        kernel_size: 3                              # kernel size of the residual blocks inside
        layers: 20                                  # number of residual blocks inside
        stacks: 5                                   # number of groups to split the residual blocks into
        residual_channels: 256                      # residual channels of the residual blocks
        gate_channels: 512                          # gate channels of the residual blocks
        skip_channels: 256                          # skip channels of the residual blocks
        aux_channels: 256                           # auxiliary channels of the residual blocks
        dropout: 0.1                                # dropout of the residual blocks
        bias: True                                  # whether to use bias in residual blocks
        use_weight_norm: False                      # whether to use weight norm in all convolutions
        init_type: "kaiming_normal"                 # type of weight initialization

    diffusion_params:
        num_train_timesteps: 100                    # number of timesteps between the noise and the real mel during training
        beta_start: 0.0001                          # beta start parameter for the scheduler
        beta_end: 0.06                              # beta end parameter for the scheduler
        beta_schedule: "linear"                     # beta schedule parameter for the scheduler
        num_max_timesteps: 100                      # max timestep for the transition from real mel to noise
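The `diffusion_params` above describe a standard linear beta schedule. As a rough sketch (not the PaddleSpeech implementation), the schedule and the forward-noising step it drives could be computed like this with NumPy:

```python
import numpy as np

def linear_beta_schedule(num_train_timesteps=100, beta_start=0.0001, beta_end=0.06):
    """Linear beta schedule matching the config values above (illustrative only)."""
    betas = np.linspace(beta_start, beta_end, num_train_timesteps)
    # alpha_bar_t = prod_{s<=t} (1 - beta_s), used by the forward process q(x_t | x_0)
    alphas_cumprod = np.cumprod(1.0 - betas)
    return betas, alphas_cumprod

betas, alphas_cumprod = linear_beta_schedule()

def add_noise(x0, noise, t, alphas_cumprod=alphas_cumprod):
    """Forward diffusion: x_t = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * noise."""
    a_bar = alphas_cumprod[t]
    return np.sqrt(a_bar) * x0 + np.sqrt(1.0 - a_bar) * noise
```

With `num_max_timesteps: 100` equal to `num_train_timesteps`, the model noises the mel all the way to (near-)Gaussian noise before denoising at inference.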



###########################################################
# UPDATER SETTING #
###########################################################
fs2_updater:
    use_masking: True       # whether to apply masking for padded part in loss calculation

ds_updater:
    use_masking: True       # whether to apply masking for padded part in loss calculation


###########################################################
# OPTIMIZER SETTING #
###########################################################
# fastspeech2 optimizer
fs2_optimizer:
    optim: adam             # optimizer type
    learning_rate: 0.001    # learning rate

# diffusion optimizer
ds_optimizer_params:
    beta1: 0.9
    beta2: 0.98
    weight_decay: 0.0

ds_scheduler_params:
    learning_rate: 0.001
    gamma: 0.5
    step_size: 50000
ds_grad_norm: 1


###########################################################
# INTERVAL SETTING #
###########################################################
ds_train_start_steps: 160000 # Number of steps to start to train diffusion module.
train_max_steps: 320000 # Number of training steps.
save_interval_steps: 2000 # Interval steps to save checkpoint.
eval_interval_steps: 2000 # Interval steps to evaluate the network.
num_snapshots: 5 # Number of saved models
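The interval settings imply a two-stage schedule: the FastSpeech2-style module trains alone for the first 160,000 steps, then diffusion training runs until step 320,000. A sketch of that gating logic (a hypothetical helper, not the actual updater; it assumes the FastSpeech2 module stops updating once diffusion training begins, as in the original DiffSinger recipe):

```python
def active_modules(step, ds_train_start_steps=160000, train_max_steps=320000):
    """Return which sub-modules are being optimized at a global step (illustrative)."""
    if step >= train_max_steps:
        return []                  # training finished
    if step < ds_train_start_steps:
        return ["fastspeech2"]     # stage 1: train the acoustic backbone
    return ["diffusion"]           # stage 2: train the diffusion denoiser
```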


###########################################################
# OTHER SETTING #
###########################################################
seed: 10086
find_unused_parameters: True
66 changes: 66 additions & 0 deletions examples/opencpop/svs1/local/preprocess.sh
@@ -0,0 +1,66 @@
#!/bin/bash

stage=0
stop_stage=100

config_path=$1

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # extract features
    echo "Extract features ..."
    python3 ${BIN_DIR}/preprocess.py \
        --dataset=opencpop \
        --rootdir=~/datasets/SVS/Opencpop/segments \
        --dumpdir=dump \
        --label-file=~/datasets/SVS/Opencpop/segments/transcriptions.txt \
        --config=${config_path} \
        --num-cpu=20 \
        --cut-sil=True
fi

if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # get features' stats (mean and std)
    echo "Get features' stats ..."
    python3 ${MAIN_ROOT}/utils/compute_statistics.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --field-name="speech"

    python3 ${MAIN_ROOT}/utils/compute_statistics.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --field-name="pitch"

    python3 ${MAIN_ROOT}/utils/compute_statistics.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --field-name="energy"
fi

if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # normalize and convert phone/speaker to id; dev and test should use train's stats
    echo "Normalize ..."
    python3 ${BIN_DIR}/normalize.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --dumpdir=dump/train/norm \
        --speech-stats=dump/train/speech_stats.npy \
        --pitch-stats=dump/train/pitch_stats.npy \
        --energy-stats=dump/train/energy_stats.npy \
        --phones-dict=dump/phone_id_map.txt \
        --speaker-dict=dump/speaker_id_map.txt

    python3 ${BIN_DIR}/normalize.py \
        --metadata=dump/dev/raw/metadata.jsonl \
        --dumpdir=dump/dev/norm \
        --speech-stats=dump/train/speech_stats.npy \
        --pitch-stats=dump/train/pitch_stats.npy \
        --energy-stats=dump/train/energy_stats.npy \
        --phones-dict=dump/phone_id_map.txt \
        --speaker-dict=dump/speaker_id_map.txt

    python3 ${BIN_DIR}/normalize.py \
        --metadata=dump/test/raw/metadata.jsonl \
        --dumpdir=dump/test/norm \
        --speech-stats=dump/train/speech_stats.npy \
        --pitch-stats=dump/train/pitch_stats.npy \
        --energy-stats=dump/train/energy_stats.npy \
        --phones-dict=dump/phone_id_map.txt \
        --speaker-dict=dump/speaker_id_map.txt
fi
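Stage 3 applies the train-set statistics to the dev and test splits as well, so all splits are normalized in the same coordinate system. Conceptually this is a per-dimension z-score; a sketch with NumPy, assuming the `*_stats.npy` file stacks the per-dimension mean and standard deviation as two rows (that layout is an assumption for illustration, not a guarantee about the actual file format):

```python
import numpy as np

def zscore_normalize(feats, stats_path=None, stats=None):
    """Normalize features with train-set statistics (illustrative).

    `stats` (or the array loaded from `stats_path`) is assumed to hold
    [mean, std] stacked as two rows of shape (2, feat_dim).
    """
    if stats is None:
        stats = np.load(stats_path)
    mean, std = stats[0], stats[1]
    return (feats - mean) / std
```

Using the same `dump/train/*_stats.npy` for every split avoids a train/eval mismatch in feature scale.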
28 changes: 28 additions & 0 deletions examples/opencpop/svs1/local/synthesize.sh
@@ -0,0 +1,28 @@
#!/bin/bash

config_path=$1
train_output_path=$2
ckpt_name=$3

stage=0
stop_stage=0

# pwgan
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    FLAGS_allocator_strategy=naive_best_fit \
    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
    python3 ${BIN_DIR}/../synthesize.py \
        --am=diffsinger_opencpop \
        --am_config=${config_path} \
        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
        --am_stat=dump/train/speech_stats.npy \
        --voc=pwgan_opencpop \
        --voc_config=pwgan_opencpop/default.yaml \
        --voc_ckpt=pwgan_opencpop/snapshot_iter_100000.pdz \
        --voc_stat=pwgan_opencpop/feats_stats.npy \
        --test_metadata=test.jsonl \
        --output_dir=${train_output_path}/test \
        --phones_dict=dump/phone_id_map.txt
fi

12 changes: 12 additions & 0 deletions examples/opencpop/svs1/local/train.sh
@@ -0,0 +1,12 @@
#!/bin/bash

config_path=$1
train_output_path=$2

python3 ${BIN_DIR}/train.py \
    --train-metadata=dump/train/norm/metadata.jsonl \
    --dev-metadata=dump/dev/norm/metadata.jsonl \
    --config=${config_path} \
    --output-dir=${train_output_path} \
    --ngpu=1 \
    --phones-dict=dump/phone_id_map.txt
13 changes: 13 additions & 0 deletions examples/opencpop/svs1/path.sh
@@ -0,0 +1,13 @@
#!/bin/bash
export MAIN_ROOT=`realpath ${PWD}/../../../`

export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C

export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}

MODEL=diffsinger
export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}
32 changes: 32 additions & 0 deletions examples/opencpop/svs1/run.sh
@@ -0,0 +1,32 @@
#!/bin/bash

set -e
source path.sh

gpus=0
stage=0
stop_stage=100

conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_320000.pdz

# With the following options, you can choose the stage range you want to run,
# e.g. `./run.sh --stage 0 --stop-stage 0`.
# These options cannot be mixed with positional arguments `$1`, `$2`, ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # prepare data
    ./local/preprocess.sh ${conf_path} || exit -1
fi

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # train model; all `ckpt` files are saved under the `train_output_path/checkpoints/` dir
    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
fi

if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # synthesize; the vocoder is pwgan by default
    CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
5 changes: 5 additions & 0 deletions examples/opencpop/svs1/test.jsonl
@@ -0,0 +1,5 @@
{"utt_id": "2092003428", "spk_id": 0, "text": [9, 45, 20, 6, 12, 13, 33, 14, 60, 5, 5, 2, 3, 9, 21, 10, 21], "text_lengths": 17, "note": [68, 68, 68, 68, 70, 70, 68, 68, 70, 70, 72, 0, 0, 65, 65, 72, 72], "note_dur": [0.21095, 0.21095, 0.49392, 0.49392, 0.26976, 0.26976, 0.342, 0.342, 0.32028, 0.32028, 1.47701, 0.23502, 0.1304, 0.24228, 0.24228, 0.44816, 0.44816], "is_slur": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]}
{"utt_id": "2092003429", "spk_id": 0, "text": [12, 13, 13, 2, 33, 23, 43, 7, 7, 44, 38, 34, 28], "text_lengths": 13, "note": [72, 72, 70, 0, 68, 68, 68, 68, 67, 67, 67, 68, 68], "note_dur": [0.25521, 0.25521, 0.51253, 0.25985, 0.25585, 0.25585, 0.47119, 0.47119, 0.29975, 0.68713, 0.68713, 2.85768, 2.85768], "is_slur": [0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]}
{"utt_id": "2098003651", "spk_id": 0, "text": [40, 28, 41, 15, 59, 21, 40, 21, 3, 32, 6, 3, 19, 52, 3, 12, 52, 43, 8, 10, 21, 59, 55, 55, 33, 24, 24, 3, 2, 3], "text_lengths": 30, "note": [63, 63, 65, 65, 65, 65, 66, 66, 1, 66, 66, 1, 58, 58, 1, 65, 65, 63, 63, 63, 63, 61, 61, 59, 61, 61, 59, 1, 1, 1], "note_dur": [0.36, 0.36, 0.18629, 0.18629, 0.35408, 0.35408, 0.2435, 0.2435, 0.06635, 0.61554, 0.61554, 0.06766, 0.18553, 0.18553, 0.09065, 0.44146, 0.44146, 0.31192, 0.31192, 0.47628, 0.47628, 0.27498, 0.27498, 0.16992, 0.28934, 0.28934, 1.28618, 1.29928, 0.3935, 0.06307], "is_slur": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0]}
{"utt_id": "2098003652", "spk_id": 0, "text": [44, 4, 60, 5, 44, 23, 32, 37, 32, 6, 19, 52, 3, 12, 52, 43, 8, 10, 21, 59, 21, 57, 7, 3, 2, 3], "text_lengths": 26, "note": [63, 63, 65, 65, 65, 65, 66, 66, 66, 66, 68, 68, 1, 65, 65, 63, 63, 63, 63, 61, 61, 70, 70, 1, 1, 1], "note_dur": [0.25292, 0.25292, 0.22636, 0.22636, 0.45065, 0.45065, 0.24558, 0.24558, 0.45853, 0.45853, 0.58061, 0.58061, 0.07029, 0.31713, 0.31713, 0.2562, 0.2562, 0.46023, 0.46023, 0.52357, 0.52357, 2.11321, 2.11321, 0.18789, 0.47604, 0.07621], "is_slur": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}
{"utt_id": "2093003457", "spk_id": 0, "text": [58, 25, 31, 30, 57, 36, 11, 7, 7, 31, 26, 26, 34, 8, 2, 43, 21, 35, 21, 60, 50, 34, 14, 12, 13, 31, 21, 20, 8], "text_lengths": 29, "note": [61, 61, 66, 66, 68, 68, 70, 70, 66, 66, 66, 61, 61, 61, 0, 61, 61, 70, 70, 68, 68, 70, 70, 68, 68, 65, 65, 61, 61], "note_dur": [0.40714, 0.40714, 0.37619, 0.37619, 0.24218, 0.24218, 0.50955, 0.50955, 0.18342, 0.3154, 0.3154, 0.23502, 0.36166, 0.36166, 0.22307, 0.37727, 0.37727, 0.34055, 0.34055, 0.29962, 0.29962, 0.34451, 0.34451, 0.28377, 0.28377, 0.32339, 0.32339, 0.36034, 0.36034], "is_slur": [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}
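Each `test.jsonl` line carries the music score for one utterance: phoneme ids (`text`), MIDI note numbers (`note`), note durations in seconds (`note_dur`), and slur flags (`is_slur`), all aligned to the same length given by `text_lengths`. A small stdlib-only sketch of parsing and sanity-checking one line (the sample line below is a shortened hypothetical example, not an entry from the file):

```python
import json

def check_example(line: str) -> dict:
    """Parse one test.jsonl line and verify that the score-level
    sequences (text/note/note_dur/is_slur) all share one length."""
    ex = json.loads(line)
    n = ex["text_lengths"]
    for key in ("text", "note", "note_dur", "is_slur"):
        assert len(ex[key]) == n, f"{key} misaligned"
    return ex

sample = ('{"utt_id": "demo", "spk_id": 0, "text": [1, 2, 3], '
          '"text_lengths": 3, "note": [60, 60, 0], '
          '"note_dur": [0.2, 0.2, 0.1], "is_slur": [0, 1, 0]}')
example = check_example(sample)
```

Misaligned sequences here would surface as shape errors much later in batching, so a check like this fails fast.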