[TTS]add StarGANv2VC preprocess #3163

Merged · 7 commits · Apr 18, 2023
19 changes: 15 additions & 4 deletions examples/vctk/vc3/conf/default.yaml
@@ -1,12 +1,23 @@
###########################################################
# FEATURE EXTRACTION SETTING #
###########################################################
# not actually used; 16000 is what is actually used
sr: 24000
# the original code loads audio at 24k but extracts mel at 16k; later both loading and mel extraction should be changed to 24k
fs: 16000
n_fft: 2048
win_length: 1200
hop_length: 300
n_shift: 300
win_length: 1200 # Window length (in samples), 50ms
# If set to null, it will be the same as fft_size.
window: "hann" # Window function.

fmin: 0 # Minimum frequency of Mel basis.
fmax: 8000 # Maximum frequency of Mel basis. sr // 2
n_mels: 80
# only for StarGANv2 VC
norm: # None here
htk: True
power: 2.0


###########################################################
# MODEL SETTING #
###########################################################
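For reference, the feature-extraction settings above map directly onto librosa's mel-spectrogram arguments. A minimal sketch of the expected extraction, assuming fs=16000 is the rate actually used; the wav path and the log offset are illustrative, not taken from this PR:

import librosa
import numpy as np

# load at fs (16 kHz), the rate the recipe actually uses for mel extraction
wav, _ = librosa.load("p225_001_mic2.flac", sr=16000)
mel = librosa.feature.melspectrogram(
    y=wav,
    sr=16000,
    n_fft=2048,
    win_length=1200,
    hop_length=300,
    window="hann",
    n_mels=80,
    fmin=0,
    fmax=8000,
    htk=True,      # the "only for StarGANv2 VC" options from the config
    norm=None,
    power=2.0)
# a log-compression step is assumed here; the result has shape [80, T]
logmel = np.log(mel + 1e-5)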
27 changes: 23 additions & 4 deletions examples/vctk/vc3/local/preprocess.sh
@@ -6,13 +6,32 @@ stop_stage=100
config_path=$1

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # extract features
    echo "Extract features ..."
    python3 ${BIN_DIR}/preprocess.py \
        --dataset=vctk \
        --rootdir=~/datasets/VCTK-Corpus-0.92/ \
        --dumpdir=dump \
        --config=${config_path} \
        --num-cpu=20

fi

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # reserved stage; ":" keeps the empty block valid shell
    :
fi

if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    echo "Normalize ..."
    python3 ${BIN_DIR}/normalize.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --dumpdir=dump/train/norm \
        --speaker-dict=dump/speaker_id_map.txt

    python3 ${BIN_DIR}/normalize.py \
        --metadata=dump/dev/raw/metadata.jsonl \
        --dumpdir=dump/dev/norm \
        --speaker-dict=dump/speaker_id_map.txt

    python3 ${BIN_DIR}/normalize.py \
        --metadata=dump/test/raw/metadata.jsonl \
        --dumpdir=dump/test/norm \
        --speaker-dict=dump/speaker_id_map.txt

fi
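Going by the paths the script passes around, the dump directory is expected to look roughly like this after stage 0 and stage 2 (a sketch; only the files the script itself references are listed):

dump/
    speaker_id_map.txt
    train/
        raw/metadata.jsonl     # written by preprocess.py
        norm/metadata.jsonl    # written by normalize.py
    dev/
        raw/metadata.jsonl
        norm/metadata.jsonl
    test/
        raw/metadata.jsonl
        norm/metadata.jsonl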
78 changes: 66 additions & 12 deletions paddlespeech/t2s/datasets/am_batch_fn.py
@@ -669,18 +669,72 @@ def vits_multi_spk_batch_fn(examples):
    return batch


# unfinished
def starganv2_vc_batch_fn(examples):
    batch = {
        "x_real": None,
        "y_org": None,
        "x_ref": None,
        "x_ref2": None,
        "y_trg": None,
        "z_trg": None,
        "z_trg2": None,
    }
    return batch


# a builder is needed here because extra parameters have to be passed in
def build_starganv2_vc_collate_fn(latent_dim: int=16, max_mel_length: int=192):

    return StarGANv2VCCollateFn(
        latent_dim=latent_dim, max_mel_length=max_mel_length)


class StarGANv2VCCollateFn:
    """Functor class of common_collate_fn()"""

    def __init__(self, latent_dim: int=16, max_mel_length: int=192):
        self.latent_dim = latent_dim
        self.max_mel_length = max_mel_length

    def random_clip(self, mel: np.ndarray):
        # [80, T]
        mel_length = mel.shape[1]
        if mel_length > self.max_mel_length:
            random_start = np.random.randint(
                0, mel_length - self.max_mel_length)
            mel = mel[:, random_start:random_start + self.max_mel_length]
        return mel

    def __call__(self, examples):
        return self.starganv2_vc_batch_fn(examples)

    def starganv2_vc_batch_fn(self, examples):
        batch_size = len(examples)

        label = [np.array(item["label"], dtype=np.int64) for item in examples]
        ref_label = [
            np.array(item["ref_label"], dtype=np.int64) for item in examples
        ]

        # mels need to be randomly clipped to max_mel_length
        mel = [self.random_clip(item["mel"]) for item in examples]
        ref_mel = [self.random_clip(item["ref_mel"]) for item in examples]
        ref_mel_2 = [self.random_clip(item["ref_mel_2"]) for item in examples]

        mel = batch_sequences(mel)
        ref_mel = batch_sequences(ref_mel)
        ref_mel_2 = batch_sequences(ref_mel_2)

        # convert each batch to paddle.Tensor
        # (B,)
        label = paddle.to_tensor(label)
        ref_label = paddle.to_tensor(ref_label)
        # [B, 80, T] -> [B, 1, 80, T]
        mel = paddle.to_tensor(mel)
        ref_mel = paddle.to_tensor(ref_mel)
        ref_mel_2 = paddle.to_tensor(ref_mel_2)

        z_trg = paddle.randn([batch_size, self.latent_dim])
        z_trg2 = paddle.randn([batch_size, self.latent_dim])

        batch = {
            "x_real": mel,
            "y_org": label,
            "x_ref": ref_mel,
            "x_ref2": ref_mel_2,
            "y_trg": ref_label,
            "z_trg": z_trg,
            "z_trg2": z_trg2
        }

        return batch


# for PaddleSlim
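A quick usage sketch for the new collate function (not part of this PR; train_dataset and the batch size are placeholders):

from paddle.io import DataLoader

collate_fn = build_starganv2_vc_collate_fn(latent_dim=16, max_mel_length=192)
train_loader = DataLoader(
    train_dataset,          # e.g. a StarGANv2VCDataTable instance
    batch_size=8,
    shuffle=True,
    collate_fn=collate_fn)
for batch in train_loader:
    # x_real / x_ref / x_ref2: padded mels, y_org / y_trg: speaker ids,
    # z_trg / z_trg2: random latent codes of shape [B, latent_dim]
    x_real, z_trg = batch["x_real"], batch["z_trg"]
    break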
53 changes: 53 additions & 0 deletions paddlespeech/t2s/datasets/data_table.py
@@ -11,12 +11,14 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random
from multiprocessing import Manager
from typing import Any
from typing import Callable
from typing import Dict
from typing import List

import numpy as np
from paddle.io import Dataset


@@ -131,3 +133,54 @@ def __len__(self) -> int:
            The length of the dataset
        """
        return len(self.data)


class StarGANv2VCDataTable(DataTable):
    def __init__(self, data: List[Dict[str, Any]]):
        super().__init__(data)
        raw_data = data
        spk_id_set = list(set([item['spk_id'] for item in raw_data]))
        data_list_per_class = {}
        for spk_id in spk_id_set:
            data_list_per_class[spk_id] = []
        for item in raw_data:
            for spk_id in spk_id_set:
                if item['spk_id'] == spk_id:
                    data_list_per_class[spk_id].append(item)
        self.data_list_per_class = data_list_per_class

    def __getitem__(self, idx: int) -> Dict[str, Any]:
        """Get an example given an index.
        Args:
            idx (int): Index of the example to get
        Returns:
            Dict[str, Any]: A converted example
        """
        if self.use_cache and self.caches[idx] is not None:
            return self.caches[idx]

        data = self._get_metadata(idx)

        # clipping is done in the batch_fn
        # each metadata item is a dict like:
        # {'utt_id': 'p225_111', 'spk_id': '1', 'speech': 'path of *.npy'}
        ref_data = random.choice(self.data)
        ref_label = ref_data['spk_id']
        ref_data_2 = random.choice(self.data_list_per_class[ref_label])
        # mel_tensor, label, ref_mel_tensor, ref2_mel_tensor, ref_label
        new_example = {
            'utt_id': data['utt_id'],
            'mel': np.load(data['speech']),
            'label': int(data['spk_id']),
            'ref_mel': np.load(ref_data['speech']),
            'ref_mel_2': np.load(ref_data_2['speech']),
            'ref_label': int(ref_label)
        }

        if self.use_cache:
            self.caches[idx] = new_example

        return new_example
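A minimal sketch of how this table is expected to be built from the normalized metadata (the path follows local/preprocess.sh above; the printed shapes are illustrative):

import jsonlines
from paddlespeech.t2s.datasets.data_table import StarGANv2VCDataTable

with jsonlines.open("dump/train/norm/metadata.jsonl", 'r') as reader:
    metadata = list(reader)
train_dataset = StarGANv2VCDataTable(data=metadata)

example = train_dataset[0]
# keys: utt_id, mel [80, T], label, ref_mel, ref_mel_2, ref_label
print(example["utt_id"], example["mel"].shape, example["label"])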
101 changes: 101 additions & 0 deletions paddlespeech/t2s/exps/starganv2_vc/normalize.py
@@ -0,0 +1,101 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Normalize feature files and dump them."""
import argparse
import logging
from operator import itemgetter
from pathlib import Path

import jsonlines
import numpy as np
import tqdm

from paddlespeech.t2s.datasets.data_table import DataTable


def main():
    """Run preprocessing process."""
    parser = argparse.ArgumentParser(
        description="Normalize dumped raw features (See detail in parallel_wavegan/bin/normalize.py)."
    )
    parser.add_argument(
        "--metadata",
        type=str,
        required=True,
        help="metadata.jsonl of the dumped raw features to be normalized.")

    parser.add_argument(
        "--dumpdir",
        type=str,
        required=True,
        help="directory to dump normalized feature files.")

    parser.add_argument(
        "--speaker-dict", type=str, default=None, help="speaker id map file.")

    args = parser.parse_args()

    dumpdir = Path(args.dumpdir).expanduser()
    # use absolute path
    dumpdir = dumpdir.resolve()
    dumpdir.mkdir(parents=True, exist_ok=True)

    # get dataset
    with jsonlines.open(args.metadata, 'r') as reader:
        metadata = list(reader)
    dataset = DataTable(
        metadata, converters={
            "speech": np.load,
        })
    logging.info(f"The number of files = {len(dataset)}.")

    vocab_speaker = {}
    with open(args.speaker_dict, 'rt') as f:
        spk_id = [line.strip().split() for line in f.readlines()]
    for spk, id in spk_id:
        vocab_speaker[spk] = int(id)

    # process each file
    output_metadata = []

    for item in tqdm.tqdm(dataset):
        utt_id = item['utt_id']
        speech = item['speech']

        # normalize
        # hard-coded for now
        mean, std = -4, 4
        speech = (speech - mean) / std
        speech_path = dumpdir / f"{utt_id}_speech.npy"
        np.save(speech_path, speech.astype(np.float32), allow_pickle=False)

        spk_id = vocab_speaker[item["speaker"]]
        record = {
            "utt_id": item['utt_id'],
            "spk_id": spk_id,
            "speech": str(speech_path),
        }

        output_metadata.append(record)
    output_metadata.sort(key=itemgetter('utt_id'))
    output_metadata_path = Path(args.dumpdir) / "metadata.jsonl"
    with jsonlines.open(output_metadata_path, 'w') as writer:
        for item in output_metadata:
            writer.write(item)
    logging.info(f"metadata dumped into {output_metadata_path}")


if __name__ == "__main__":
    main()
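The hard-coded mean/std above appears to follow the reference StarGANv2-VC log-mel normalization; a toy round trip to show what it does (values are illustrative):

import numpy as np

logmel = np.array([-4.0, -2.0, 0.0], dtype=np.float32)   # toy log-mel values
mean, std = -4, 4                                         # same constants as above
normed = (logmel - mean) / std                            # -> [0.0, 0.5, 1.0]
restored = normed * std + mean                            # inverse, needed at synthesis time
assert np.allclose(restored, logmel)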