Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[s2t] mv dataset into paddlespeech.dataset #3183

Merged
merged 3 commits into from
Apr 21, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
136 changes: 2 additions & 134 deletions dataset/aidatatang_200zh/aidatatang_200zh.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,139 +18,7 @@
meta data (i.e. audio filepath, transcript and audio duration)
of each audio file in the data set.
"""
import argparse
import codecs
import json
import os
from pathlib import Path

import soundfile

from utils.utility import download
from utils.utility import unpack

# Cache root where downloaded speech corpora are stored by default.
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')

# OpenSLR download location for the aidatatang_200zh corpus (resource 62).
URL_ROOT = 'http://www.openslr.org/resources/62'
# URL_ROOT = 'https://openslr.magicdatatech.com/resources/62'
DATA_URL = URL_ROOT + '/aidatatang_200zh.tgz'
# Checksum passed to the downloader to validate the fetched archive.
MD5_DATA = '6e0f4f39cd5f667a7ee53c397c8d0949'

# Command-line interface: where to store the corpus and the manifest files.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--target_dir",
    default=DATA_HOME + "/aidatatang_200zh",
    type=str,
    help="Directory to save the dataset. (default: %(default)s)")
parser.add_argument(
    "--manifest_prefix",
    default="manifest",
    type=str,
    help="Filepath prefix for output manifests. (default: %(default)s)")
args = parser.parse_args()


def create_manifest(data_dir, manifest_path_prefix):
    """Create JSON-lines manifest files for the train/dev/test splits.

    Reads the corpus-wide transcript file, walks the audio directory of
    each split, and writes one JSON line per utterance (utt id, speaker,
    audio path, duration in seconds, and text).  Also writes a
    ``<dtype>.meta`` summary file next to the manifests for each split.

    Args:
        data_dir: Root of the unpacked aidatatang_200zh data.
        manifest_path_prefix: Filepath prefix for the output manifests;
            the split name is appended (e.g. ``manifest.train``).
    """
    print("Creating manifest %s ..." % manifest_path_prefix)
    json_lines = []
    transcript_path = os.path.join(data_dir, 'transcript',
                                   'aidatatang_200_zh_transcript.txt')
    transcript_dict = {}
    for line in codecs.open(transcript_path, 'r', 'utf-8'):
        line = line.strip()
        if line == '':
            continue
        audio_id, text = line.split(' ', 1)
        # Remove all whitespace so the text is a plain character sequence.
        text = ''.join(text.split())
        transcript_dict[audio_id] = text

    data_types = ['train', 'dev', 'test']
    for dtype in data_types:
        del json_lines[:]
        total_sec = 0.0
        total_text = 0  # character count — an integer, not a float
        total_num = 0

        audio_dir = os.path.join(data_dir, 'corpus/', dtype)
        for subfolder, _, filelist in sorted(os.walk(audio_dir)):
            for fname in filelist:
                if not fname.endswith('.wav'):
                    continue

                audio_path = os.path.abspath(os.path.join(subfolder, fname))
                audio_id = os.path.basename(fname)[:-4]
                # Skip audio with no transcription — consistent with the
                # aishell preparation script and avoids a KeyError below.
                if audio_id not in transcript_dict:
                    continue
                # Speaker id is the name of the directory holding the wav.
                utt2spk = Path(audio_path).parent.name

                audio_data, samplerate = soundfile.read(audio_path)
                duration = float(len(audio_data) / samplerate)
                text = transcript_dict[audio_id]
                json_lines.append(
                    json.dumps(
                        {
                            'utt': audio_id,
                            'utt2spk': str(utt2spk),
                            'feat': audio_path,
                            'feat_shape': (duration, ),  # second
                            'text': text,
                        },
                        ensure_ascii=False))

                total_sec += duration
                total_text += len(text)
                total_num += 1

        manifest_path = manifest_path_prefix + '.' + dtype
        with codecs.open(manifest_path, 'w', 'utf-8') as fout:
            for line in json_lines:
                fout.write(line + '\n')

        manifest_dir = os.path.dirname(manifest_path_prefix)
        meta_path = os.path.join(manifest_dir, dtype) + '.meta'
        with open(meta_path, 'w') as f:
            print(f"{dtype}:", file=f)
            print(f"{total_num} utts", file=f)
            print(f"{total_sec / (60*60)} h", file=f)
            print(f"{total_text} text", file=f)
            # Guard the ratios: an empty split would raise ZeroDivisionError.
            if total_sec > 0:
                print(f"{total_text / total_sec} text/sec", file=f)
            if total_num > 0:
                print(f"{total_sec / total_num} sec/utt", file=f)


def prepare_dataset(url, md5sum, target_dir, manifest_path, subset):
    """Download, unpack and create manifest file.

    Args:
        url: Download URL of the dataset archive.
        md5sum: Expected checksum of the archive, passed to the downloader.
        target_dir: Directory in which to store the dataset.
        manifest_path: Filepath prefix for the output manifests.
        subset: Dataset subdirectory name (e.g. ``aidatatang_200zh``).
    """
    data_dir = os.path.join(target_dir, subset)
    if not os.path.exists(data_dir):
        filepath = download(url, md5sum, target_dir)
        unpack(filepath, target_dir)
        # The corpus nests one tar per directory under corpus/; unpack each
        # in place.  NOTE(review): the third unpack() argument presumably
        # deletes the tar after extraction — confirm against the helper.
        # The inner walk uses its own name (tarlist) so it no longer
        # shadows the outer loop's file list.
        audio_dir = os.path.join(data_dir, 'corpus')
        for subfolder, dirlist, _ in sorted(os.walk(audio_dir)):
            for sub in dirlist:
                print(f"unpack dir {sub}...")
                for folder, _, tarlist in sorted(
                        os.walk(os.path.join(subfolder, sub))):
                    for ftar in tarlist:
                        unpack(os.path.join(folder, ftar), folder, True)
    else:
        print("Skip downloading and unpacking. Data already exists in %s." %
              target_dir)

    create_manifest(data_dir, manifest_path)


def main():
    """Entry point: normalize the target dir, then fetch and prepare data."""
    target = args.target_dir
    if target.startswith('~'):
        target = os.path.expanduser(target)
        args.target_dir = target

    prepare_dataset(
        url=DATA_URL,
        md5sum=MD5_DATA,
        target_dir=target,
        manifest_path=args.manifest_prefix,
        subset='aidatatang_200zh')

    print("Data download and manifest prepare done!")

from paddlespeech.dataset.aidatatang_200zh import aidatatang_200zh_main

if __name__ == '__main__':
main()
aidatatang_200zh_main()
3 changes: 0 additions & 3 deletions dataset/aishell/README.md

This file was deleted.

140 changes: 2 additions & 138 deletions dataset/aishell/aishell.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,143 +18,7 @@
meta data (i.e. audio filepath, transcript and audio duration)
of each audio file in the data set.
"""
import argparse
import codecs
import json
import os
from pathlib import Path

import soundfile

from utils.utility import download
from utils.utility import unpack

# Cache root where downloaded speech corpora are stored by default.
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')

# OpenSLR download location for the AISHELL corpus (resource 33):
# the speech archive plus a separate lexicon/resource archive.
URL_ROOT = 'http://openslr.elda.org/resources/33'
# URL_ROOT = 'https://openslr.magicdatatech.com/resources/33'
DATA_URL = URL_ROOT + '/data_aishell.tgz'
MD5_DATA = '2f494334227864a8a8fec932999db9d8'
RESOURCE_URL = URL_ROOT + '/resource_aishell.tgz'
MD5_RESOURCE = '957d480a0fcac85fc18e550756f624e5'

# Command-line interface: where to store the corpus and the manifest files.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--target_dir",
    default=DATA_HOME + "/Aishell",
    type=str,
    help="Directory to save the dataset. (default: %(default)s)")
parser.add_argument(
    "--manifest_prefix",
    default="manifest",
    type=str,
    help="Filepath prefix for output manifests. (default: %(default)s)")
args = parser.parse_args()


def create_manifest(data_dir, manifest_path_prefix):
    """Create JSON-lines manifest files for the train/dev/test splits.

    Reads the corpus-wide transcript file, walks the audio directory of
    each split, and writes one JSON line per utterance (utt id, speaker,
    audio path, duration in seconds, and text).  Also writes a
    ``<dtype>.meta`` summary file next to the manifests for each split.

    Args:
        data_dir: Root of the unpacked ``data_aishell`` directory.
        manifest_path_prefix: Filepath prefix for the output manifests;
            the split name is appended (e.g. ``manifest.train``).
    """
    print("Creating manifest %s ..." % manifest_path_prefix)
    json_lines = []
    transcript_path = os.path.join(data_dir, 'transcript',
                                   'aishell_transcript_v0.8.txt')
    transcript_dict = {}
    for line in codecs.open(transcript_path, 'r', 'utf-8'):
        line = line.strip()
        if line == '':
            continue
        audio_id, text = line.split(' ', 1)
        # Remove all whitespace so the text is a plain character sequence.
        text = ''.join(text.split())
        transcript_dict[audio_id] = text

    data_types = ['train', 'dev', 'test']
    for dtype in data_types:
        del json_lines[:]
        total_sec = 0.0
        total_text = 0  # character count — an integer, not a float
        total_num = 0

        audio_dir = os.path.join(data_dir, 'wav', dtype)
        for subfolder, _, filelist in sorted(os.walk(audio_dir)):
            for fname in filelist:
                audio_path = os.path.abspath(os.path.join(subfolder, fname))
                audio_id = os.path.basename(fname)[:-4]
                # if no transcription for audio then skipped
                if audio_id not in transcript_dict:
                    continue

                # Speaker id is the name of the directory holding the wav.
                utt2spk = Path(audio_path).parent.name
                audio_data, samplerate = soundfile.read(audio_path)
                duration = float(len(audio_data) / samplerate)
                text = transcript_dict[audio_id]
                json_lines.append(
                    json.dumps(
                        {
                            'utt': audio_id,
                            'utt2spk': str(utt2spk),
                            'feat': audio_path,
                            'feat_shape': (duration, ),  # second
                            'text': text
                        },
                        ensure_ascii=False))

                total_sec += duration
                total_text += len(text)
                total_num += 1

        manifest_path = manifest_path_prefix + '.' + dtype
        with codecs.open(manifest_path, 'w', 'utf-8') as fout:
            for line in json_lines:
                fout.write(line + '\n')

        manifest_dir = os.path.dirname(manifest_path_prefix)
        meta_path = os.path.join(manifest_dir, dtype) + '.meta'
        with open(meta_path, 'w') as f:
            print(f"{dtype}:", file=f)
            print(f"{total_num} utts", file=f)
            print(f"{total_sec / (60*60)} h", file=f)
            print(f"{total_text} text", file=f)
            # Guard the ratios: an empty split would raise ZeroDivisionError.
            if total_sec > 0:
                print(f"{total_text / total_sec} text/sec", file=f)
            if total_num > 0:
                print(f"{total_sec / total_num} sec/utt", file=f)


def prepare_dataset(url, md5sum, target_dir, manifest_path=None):
    """Download, unpack and create manifest file."""
    data_dir = os.path.join(target_dir, 'data_aishell')
    if os.path.exists(data_dir):
        print("Skip downloading and unpacking. Data already exists in %s." %
              target_dir)
    else:
        archive = download(url, md5sum, target_dir)
        unpack(archive, target_dir)
        # Unpack every per-speaker audio tar found under the wav directory.
        wav_root = os.path.join(data_dir, 'wav')
        for folder, _, names in sorted(os.walk(wav_root)):
            for name in names:
                unpack(os.path.join(folder, name), folder, True)

    if manifest_path:
        create_manifest(data_dir, manifest_path)


def main():
    """Entry point: fetch the speech data and the lexicon resource."""
    if args.target_dir.startswith('~'):
        args.target_dir = os.path.expanduser(args.target_dir)

    # (url, md5sum, manifest prefix) — the resource archive gets no manifest.
    downloads = [
        (DATA_URL, MD5_DATA, args.manifest_prefix),
        (RESOURCE_URL, MD5_RESOURCE, None),
    ]
    for url, md5sum, manifest in downloads:
        prepare_dataset(
            url=url,
            md5sum=md5sum,
            target_dir=args.target_dir,
            manifest_path=manifest)

    print("Data download and manifest prepare done!")

from paddlespeech.dataset.aishell import aishell_main

if __name__ == '__main__':
main()
aishell_main()
4 changes: 2 additions & 2 deletions dataset/librispeech/librispeech.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@
import distutils.util
import soundfile

from utils.utility import download
from utils.utility import unpack
from paddlespeech.dataset.download import download
from paddlespeech.dataset.download import unpack

URL_ROOT = "http://openslr.elda.org/resources/12"
#URL_ROOT = "https://openslr.magicdatatech.com/resources/12"
Expand Down
4 changes: 2 additions & 2 deletions dataset/mini_librispeech/mini_librispeech.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,8 @@

import soundfile

from utils.utility import download
from utils.utility import unpack
from paddlespeech.dataset.download import download
from paddlespeech.dataset.download import unpack

URL_ROOT = "http://openslr.elda.org/resources/31"
URL_TRAIN_CLEAN = URL_ROOT + "/train-clean-5.tar.gz"
Expand Down
4 changes: 2 additions & 2 deletions dataset/musan/musan.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@

import soundfile

from utils.utility import download
from utils.utility import unpack
from paddlespeech.dataset.download import download
from paddlespeech.dataset.download import unpack

DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')

Expand Down
4 changes: 2 additions & 2 deletions dataset/rir_noise/rir_noise.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@

import soundfile

from utils.utility import download
from utils.utility import unzip
from paddlespeech.dataset.download import download
from paddlespeech.dataset.download import unzip

DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')

Expand Down
4 changes: 2 additions & 2 deletions dataset/thchs30/thchs30.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,8 @@

import soundfile

from utils.utility import download
from utils.utility import unpack
from paddlespeech.dataset.download import download
from paddlespeech.dataset.download import unpack

DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')

Expand Down
2 changes: 1 addition & 1 deletion dataset/timit/timit.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@

import soundfile

from utils.utility import unzip
from paddlespeech.dataset.download import unzip

URL_ROOT = ""
MD5_DATA = "45c68037c7fdfe063a43c851f181fb2d"
Expand Down
6 changes: 3 additions & 3 deletions dataset/voxceleb/voxceleb1.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,9 @@

import soundfile

from utils.utility import check_md5sum
from utils.utility import download
from utils.utility import unzip
from paddlespeech.dataset.download import check_md5sum
from paddlespeech.dataset.download import download
from paddlespeech.dataset.download import unzip

# all the data will be download in the current data/voxceleb directory default
DATA_HOME = os.path.expanduser('.')
Expand Down
6 changes: 3 additions & 3 deletions dataset/voxceleb/voxceleb2.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,9 @@

import soundfile

from utils.utility import check_md5sum
from utils.utility import download
from utils.utility import unzip
from paddlespeech.dataset.download import check_md5sum
from paddlespeech.dataset.download import download
from paddlespeech.dataset.download import unzip

# all the data will be download in the current data/voxceleb directory default
DATA_HOME = os.path.expanduser('.')
Expand Down
6 changes: 3 additions & 3 deletions dataset/voxforge/voxforge.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,9 @@

import soundfile

from utils.utility import download_multi
from utils.utility import getfile_insensitive
from utils.utility import unpack
from paddlespeech.dataset.download import download_multi
from paddlespeech.dataset.download import getfile_insensitive
from paddlespeech.dataset.download import unpack

DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')

Expand Down
Loading